whispercpp 1.3.5 → 1.3.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (1017)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -19,6 +19,7 @@
19
19
  #include <cstdlib>
20
20
  #include <float.h>
21
21
  #include <limits>
22
+ #include <optional>
22
23
  #include <stdint.h>
23
24
  #include <stdio.h>
24
25
  #include <vector>
@@ -30,11 +31,21 @@
30
31
  #include <regex>
31
32
 
32
33
  #include <sycl/sycl.hpp>
34
+ #include <sycl/backend.hpp>
35
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
36
+ #include <level_zero/ze_api.h>
37
+ #endif
33
38
  #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
34
39
  # include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
35
40
  #endif
41
+ #if SYCL_EXT_ONEAPI_VIRTUAL_MEM
42
+ # include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
43
+ # include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
44
+ # define GGML_SYCL_USE_VMM
45
+ #endif
36
46
  #include <sycl/half_type.hpp>
37
47
 
48
+ #include "ggml.h"
38
49
  #include "ggml-sycl.h"
39
50
  #include "ggml-impl.h"
40
51
  #include "ggml-backend-impl.h"
@@ -43,25 +54,35 @@
43
54
  #include "ggml-sycl/backend.hpp"
44
55
  #include "ggml-sycl/common.hpp"
45
56
  #include "ggml-sycl/element_wise.hpp"
57
+ #include "ggml-sycl/gemm.hpp"
58
+ #include "ggml-sycl/getrows.hpp"
46
59
  #include "ggml-sycl/norm.hpp"
47
60
  #include "ggml-sycl/presets.hpp"
48
- #include "ggml-sycl/gemm.hpp"
61
+ #include "ggml-sycl/quantize.hpp"
62
+ #include "ggml-sycl/repeat_back.hpp"
49
63
  #include "ggml-sycl/set_rows.hpp"
50
64
  #include "ggml-sycl/set.hpp"
51
- #include "ggml-sycl/sycl_hw.hpp"
52
- #include "ggml-sycl/getrows.hpp"
53
- #include "ggml-sycl/repeat_back.hpp"
54
- #include "ggml-sycl/quantize.hpp"
55
65
  #include "ggml-sycl/ssm_conv.hpp"
56
- #include "ggml.h"
66
+ #include "ggml-sycl/sycl_hw.hpp"
67
+ #include "ggml-sycl/ssm_scan.hpp"
68
+ #include "ggml-sycl/fill.hpp"
69
+ #include "ggml-sycl/cumsum.hpp"
70
+ #include "ggml-sycl/diag.hpp"
71
+ #include "ggml-sycl/solve_tri.hpp"
72
+ #include "ggml-sycl/gated_delta_net.hpp"
57
73
 
58
74
  static bool g_sycl_loaded = false;
59
75
  int g_ggml_sycl_debug = 0;
60
76
  int g_ggml_sycl_disable_optimize = 0;
61
77
  int g_ggml_sycl_disable_graph = 0;
62
78
  int g_ggml_sycl_disable_dnn = 0;
79
+ int g_ggml_sycl_enable_vmm = 1;
63
80
  int g_ggml_sycl_prioritize_dmmv = 0;
64
81
  int g_ggml_sycl_use_async_mem_op = 0;
82
+ int g_ggml_sycl_use_async_mem_op_requested = 1;
83
+ int g_ggml_sycl_enable_level_zero = 0;
84
+ int g_ggml_sycl_enable_flash_attention = 1;
85
+
65
86
 
66
87
  static ggml_sycl_device_info ggml_sycl_init() {
67
88
  ggml_sycl_device_info info = {};
@@ -82,23 +103,50 @@ static ggml_sycl_device_info ggml_sycl_init() {
82
103
  // GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
83
104
  // #endif
84
105
  for (int i = 0; i < info.device_count; ++i) {
85
- info.devices[i].vmm = 0;
86
106
  dpct::device_info prop;
87
- sycl::device device = dpct::dev_mgr::instance().get_device(i);
107
+ auto & device = dpct::dev_mgr::instance().get_device(i);
88
108
 
89
109
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
90
110
  prop, device)));
91
111
 
112
+ #if !defined(GGML_SYCL_USE_VMM)
113
+ info.devices[i].vmm = 0;
114
+ #else
115
+ info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
116
+ if (info.devices[i].vmm) {
117
+ // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
118
+ // but the L0 API requires a larger page size for allocs above 2 MiB and
119
+ // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
120
+ // Here we clamp it to 2 MiB for simplicity, but other devices may require
121
+ // calling zeVirtualMemQueryPageSize or yet unexposed public API.
122
+ const size_t physical_page = 2ull << 20; // 2 MiB
123
+ info.devices[i].vmm_granularity = std::max<size_t>(
124
+ sycl::ext::oneapi::experimental::get_mem_granularity(
125
+ device, sycl::context(device)),
126
+ physical_page);
127
+ }
128
+ #endif
129
+
92
130
  info.default_tensor_split[i] = total_vram;
93
131
  total_vram += prop.get_global_mem_size();
94
132
 
95
133
  info.devices[i].cc =
96
134
  100 * prop.get_major_version() + 10 * prop.get_minor_version();
97
- info.devices[i].nsm = prop.get_max_compute_units();
135
+ info.devices[i].nsm = prop.get_max_compute_units() / 16; //16: Number of Xe Cores
98
136
  info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
99
137
  info.devices[i].smpbo = prop.get_local_mem_size();
138
+ info.devices[i].warp_size = WARP_SIZE;
100
139
 
101
140
  info.max_work_group_sizes[i] = prop.get_max_work_group_size();
141
+ info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
142
+ info.devices[i].hw_info = get_device_hw_info(&device);
143
+
144
+ // Only check GPU devices; CPU devices use OpenCL and would otherwise
145
+ // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
146
+ if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
147
+ GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
148
+ info.ext_oneapi_level_zero = false;
149
+ }
102
150
  }
103
151
 
104
152
  for (int id = 0; id < info.device_count; ++id) {
@@ -210,8 +258,54 @@ static void ggml_check_sycl() try {
210
258
  g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
211
259
  g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
212
260
  g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
261
+ g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
213
262
  g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
263
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
264
+ g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
265
+ #else
266
+ g_ggml_sycl_enable_level_zero = 0;
267
+ #endif
268
+
269
+ #ifdef SYCL_FLASH_ATTN
270
+ g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
271
+ #else
272
+ g_ggml_sycl_enable_flash_attention = 0;
273
+ #endif
274
+
214
275
  GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
276
+
277
+ GGML_LOG_INFO("Build with Macros:\n");
278
+ #if defined(GGML_SYCL_FORCE_MMQ)
279
+ GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
280
+ #else
281
+ GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n");
282
+ #endif
283
+ #if defined(GGML_SYCL_F16)
284
+ GGML_LOG_INFO(" GGML_SYCL_F16: yes\n");
285
+ #else
286
+ GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
287
+ #endif
288
+ #if defined(GGML_SYCL_GRAPH)
289
+ GGML_LOG_INFO(" GGML_SYCL_GRAPH: yes\n");
290
+ #else
291
+ GGML_LOG_INFO(" GGML_SYCL_GRAPH: no\n");
292
+ #endif
293
+ #if defined(GGML_SYCL_DNNL)
294
+ GGML_LOG_INFO(" GGML_SYCL_DNNL: yes\n");
295
+ #else
296
+ GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
297
+ #endif
298
+ #if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
299
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
300
+ #else
301
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
302
+ #endif
303
+ #if defined(GGML_SYCL_USE_VMM)
304
+ GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
305
+ #else
306
+ GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
307
+ #endif
308
+
215
309
  GGML_LOG_INFO("Running with Environment Variables:\n");
216
310
  GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
217
311
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
@@ -220,22 +314,30 @@ static void ggml_check_sycl() try {
220
314
  #else
221
315
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
222
316
  #endif
317
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
318
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
319
+ #else
320
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
321
+ #endif
223
322
  #if GGML_SYCL_DNNL
224
323
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
225
324
  #else
226
325
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
227
326
  #endif
228
- GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
229
- GGML_LOG_INFO("Build with Macros:\n");
230
- #if defined(GGML_SYCL_FORCE_MMQ)
231
- GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
327
+ #if defined(GGML_SYCL_USE_VMM)
328
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
232
329
  #else
233
- GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n");
330
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
234
331
  #endif
235
- #if defined(GGML_SYCL_F16)
236
- GGML_LOG_INFO(" GGML_SYCL_F16: yes\n");
332
+ GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
333
+ g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
334
+ GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
335
+
336
+ #ifdef SYCL_FLASH_ATTN
337
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
237
338
  #else
238
- GGML_LOG_INFO(" GGML_SYCL_F16: no\n");
339
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d disabled by compile flag\n",
340
+ g_ggml_sycl_enable_flash_attention);
239
341
  #endif
240
342
 
241
343
  /* NOT REMOVE, keep it for next optimize for XMX.
@@ -245,11 +347,11 @@ static void ggml_check_sycl() try {
245
347
  fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
246
348
  #endif
247
349
  */
248
- // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
249
- // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
250
- // other places.
350
+ // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
351
+ // staging path while preserving queue ordering semantics. Graph support still depends on the extension being
352
+ // available, but it no longer needs to control the non-graph fast path.
251
353
  #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
252
- g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
354
+ g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
253
355
  if (g_ggml_sycl_use_async_mem_op) {
254
356
  for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
255
357
  if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
@@ -333,7 +435,7 @@ struct ggml_backend_sycl_buffer_context {
333
435
  ~ggml_backend_sycl_buffer_context() {
334
436
  if (dev_ptr != nullptr) {
335
437
  ggml_sycl_set_device(device);
336
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
438
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
337
439
  }
338
440
 
339
441
  //release extra used by tensors
@@ -379,11 +481,22 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
379
481
  assert(tensor->view_src->buffer->buft == buffer->buft);
380
482
  return GGML_STATUS_SUCCESS;
381
483
  }
382
- if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
383
- !g_ggml_sycl_disable_optimize) {
384
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
385
- tensor->extra = extra;
386
- ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
484
+
485
+ if (!g_ggml_sycl_disable_optimize) {
486
+ // set reorder extra buffer based on supported type
487
+ switch (tensor->type) {
488
+ case GGML_TYPE_Q4_0:
489
+ case GGML_TYPE_Q8_0:
490
+ case GGML_TYPE_Q4_K:
491
+ case GGML_TYPE_Q6_K:{
492
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
493
+ tensor->extra = extra;
494
+ ctx->tensor_extras.push_back(extra);
495
+ break;
496
+ }
497
+ default:
498
+ break;
499
+ }
387
500
  }
388
501
 
389
502
  if (ggml_is_quantized(tensor->type)) {
@@ -455,8 +568,43 @@ catch (sycl::exception const &exc) {
455
568
  std::exit(1);
456
569
  }
457
570
 
571
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
572
+ static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
573
+ if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
574
+ return false;
575
+ }
576
+
577
+ ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
578
+ ze_device_properties_t props = {};
579
+ props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
580
+ ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
581
+ return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
582
+ }
583
+ #endif
584
+
458
585
  static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
459
586
  const void *ptr_src, size_t size) {
587
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
588
+ // Use Level Zero direct copy for dGPU-to-dGPU transfers.
589
+ const bool l0_copy_supported =
590
+ ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
591
+ if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
592
+ auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
593
+ auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
594
+ ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
595
+ 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
596
+ ze_command_list_handle_t cl;
597
+ ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
598
+ if (r == ZE_RESULT_SUCCESS) {
599
+ r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
600
+ zeCommandListDestroy(cl);
601
+ if (r == ZE_RESULT_SUCCESS) {
602
+ return;
603
+ }
604
+ }
605
+ }
606
+ #endif
607
+ // Host-staged copy
460
608
  char *host_buf = (char *)malloc(size);
461
609
  q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
462
610
  q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
@@ -537,9 +685,15 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
537
685
  SYCL_CHECK(
538
686
  CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
539
687
 
540
- SYCL_CHECK(CHECK_TRY_ERROR((*stream)
541
- .memset(ctx->dev_ptr, value, buffer->size)
542
- .wait()));
688
+ constexpr size_t MAX_CHUNK = 2ULL << 30; // 2 GiB
689
+ for (size_t off = 0; off < buffer->size; off += MAX_CHUNK) {
690
+ size_t chunk = std::min(buffer->size - off, MAX_CHUNK);
691
+ SYCL_CHECK(CHECK_TRY_ERROR(
692
+ (*stream)
693
+ .memset(static_cast<char*>(ctx->dev_ptr) + off, value, chunk)
694
+ .wait()
695
+ ));
696
+ }
543
697
  }
544
698
  catch (sycl::exception const &exc) {
545
699
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -589,6 +743,8 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
589
743
  /* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
590
744
  /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
591
745
  /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
746
+ /* .set_tensor_2d = */ NULL,
747
+ /* .get_tensor_2d = */ NULL,
592
748
  /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
593
749
  /* .clear = */ ggml_backend_sycl_buffer_clear,
594
750
  /* .reset = */ ggml_backend_sycl_buffer_reset,
@@ -618,8 +774,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
618
774
  size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
619
775
 
620
776
  void * dev_ptr;
621
- SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
622
- size, *stream)));
777
+ SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
623
778
  if (!dev_ptr) {
624
779
  GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
625
780
  return nullptr;
@@ -634,7 +789,7 @@ catch (sycl::exception const &exc) {
634
789
  }
635
790
 
636
791
  static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
637
- return 128;
792
+ return SYCL_BUFFER_ALIGNMENT;
638
793
  GGML_UNUSED(buft);
639
794
  }
640
795
 
@@ -860,18 +1015,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
860
1015
  size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
861
1016
  }
862
1017
 
863
- // FIXME: do not crash if SYCL Buffer alloc fails
864
- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
865
1018
  ggml_sycl_set_device(i);
866
1019
  const queue_ptr stream = ctx->streams[i];
867
1020
  char * buf;
868
- /*
869
- DPCT1009:208: SYCL uses exceptions to report errors and does not use the
870
- error codes. The original code was commented out and a warning string
871
- was inserted. You need to rewrite this code.
872
- */
873
- SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
874
- size, *stream)));
1021
+ SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
875
1022
  if (!buf) {
876
1023
  char err_buf[1024];
877
1024
  snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
@@ -1035,6 +1182,8 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
1035
1182
  /* .memset_tensor = */ NULL,
1036
1183
  /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
1037
1184
  /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
1185
+ /* .set_tensor_2d = */ NULL,
1186
+ /* .get_tensor_2d = */ NULL,
1038
1187
  /* .cpy_tensor = */ NULL,
1039
1188
  /* .clear = */ ggml_backend_sycl_split_buffer_clear,
1040
1189
  /* .reset = */ NULL,
@@ -1063,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
1063
1212
  }
1064
1213
 
1065
1214
  static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1066
- return 128;
1215
+ return SYCL_BUFFER_ALIGNMENT;
1067
1216
  GGML_UNUSED(buft);
1068
1217
  }
1069
1218
 
@@ -1157,13 +1306,28 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
1157
1306
  GGML_UNUSED(buft);
1158
1307
  }
1159
1308
 
1309
+ inline void * aligned_malloc_host(size_t alignment, size_t size) {
1310
+ #ifdef _WIN32
1311
+ return _aligned_malloc(size, alignment);
1312
+ #else
1313
+ return aligned_alloc(alignment, size);
1314
+ #endif
1315
+ }
1316
+
1317
+ inline void free_aligned_mem_host(void * memblock) {
1318
+ #ifdef _WIN32
1319
+ _aligned_free(memblock);
1320
+ #else
1321
+ free(memblock);
1322
+ #endif
1323
+ }
1324
+
1160
1325
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1161
- ggml_sycl_host_free(buffer->context);
1326
+ free_aligned_mem_host((void *)buffer->context);
1162
1327
  }
1163
1328
 
1164
1329
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1165
- void * ptr = ggml_sycl_host_malloc(size);
1166
-
1330
+ void * ptr = aligned_malloc_host(TENSOR_ALIGNMENT, size);
1167
1331
  if (ptr == nullptr) {
1168
1332
  // fallback to cpu buffer
1169
1333
  return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
@@ -1212,16 +1376,53 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1212
1376
  explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
1213
1377
 
1214
1378
  ~ggml_sycl_pool_leg() {
1379
+ #ifdef DEBUG_SYCL_POOL
1380
+ int n_cached = 0;
1381
+ size_t bytes_cached = 0;
1382
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1383
+ if (buffer_pool[i].ptr != nullptr) {
1384
+ ++n_cached;
1385
+ bytes_cached += buffer_pool[i].size;
1386
+ }
1387
+ }
1388
+ GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
1389
+ n_cached, bytes_cached / 1024.0 / 1024.0);
1390
+ const auto slots = format_slots_in_alloc_order();
1391
+ if (!slots.empty()) {
1392
+ GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
1393
+ }
1394
+ #endif
1395
+
1215
1396
  for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1216
1397
  ggml_sycl_buffer & b = buffer_pool[i];
1217
1398
  if (b.ptr != nullptr) {
1218
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
1399
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
1219
1400
  pool_size -= b.size;
1220
1401
  }
1221
1402
  }
1222
1403
  GGML_ASSERT(pool_size == 0);
1223
1404
  }
1224
1405
 
1406
+ #ifdef DEBUG_SYCL_POOL
1407
+ std::string format_slots_in_alloc_order() const {
1408
+ std::string line;
1409
+ char buf[32];
1410
+ bool first = true;
1411
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1412
+ if (buffer_pool[i].ptr == nullptr) {
1413
+ continue;
1414
+ }
1415
+ if (!first) {
1416
+ line += '/';
1417
+ }
1418
+ first = false;
1419
+ snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
1420
+ line += buf;
1421
+ }
1422
+ return line;
1423
+ }
1424
+ #endif
1425
+
1225
1426
  void * alloc(size_t size, size_t * actual_size) override {
1226
1427
  #ifdef DEBUG_sycl_MALLOC
1227
1428
  int nnz = 0;
@@ -1263,9 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1263
1464
  void * ptr;
1264
1465
  size_t look_ahead_size = (size_t) (1.05 * size);
1265
1466
 
1266
- SYCL_CHECK(
1267
- CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
1268
- look_ahead_size, *qptr)));
1467
+ SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
1269
1468
  if (!ptr) {
1270
1469
  GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
1271
1470
  return nullptr;
@@ -1293,11 +1492,126 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1293
1492
  }
1294
1493
  }
1295
1494
  GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
1296
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
1495
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
1297
1496
  pool_size -= size;
1298
1497
  }
1299
1498
  };
1300
1499
 
1500
// pool with virtual memory management
#if defined(GGML_SYCL_USE_VMM)
// Stack-style device pool: one virtual address range of SYCL_POOL_VMM_MAX_SIZE
// bytes is reserved on first use, and physical memory is mapped at its tail
// whenever an allocation does not fit. Allocations are carved off the top of
// the used region and must be released in reverse order (see free()).
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
    static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

    int           device;
    sycl::context ctx;
    sycl::device  dev;

    uintptr_t pool_addr = 0;  // base of the reserved virtual range, 0 until first growth
    size_t    pool_used = 0;  // bytes currently handed out
    size_t    pool_size = 0;  // bytes currently backed by physical memory
    size_t    granularity;    // growth step, read from the device info table

    // One entry per growth step. The physical_mem object owns the committed
    // memory (unlike cuMemMap), so it has to stay alive while mapped.
    struct mapping {
        sycl::ext::oneapi::experimental::physical_mem phys;
        void *                                        map_ptr;
    };
    std::vector<mapping> mappings;

    explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
        device(device_),
        ctx(qptr_->get_context()),
        dev(qptr_->get_device()),
        granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
    }

    ~ggml_sycl_pool_vmm() {
        namespace vmm = sycl::ext::oneapi::experimental;

        // nothing was ever reserved, so there is nothing to release
        if (pool_addr != 0) {
            // Per spec, each unmap has to use the exact (ptr, size) pair of an
            // earlier physical_mem::map() call and has to happen before the
            // physical_mem objects are destroyed (their dtors won't unmap).
            for (auto & entry : mappings) {
                SYCL_CHECK(CHECK_TRY_ERROR(vmm::unmap(entry.map_ptr, entry.phys.size(), ctx)));
            }
            SYCL_CHECK(CHECK_TRY_ERROR(vmm::free_virtual_mem(pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
        }
    }

    // Returns a pointer to `size` bytes (rounded up to SYCL_BUFFER_ALIGNMENT)
    // at the current top of the pool and stores the rounded size in
    // *actual_size. Grows the mapped region first if the request does not fit.
    void * alloc(size_t size, size_t * actual_size) override {
        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
        const size_t padded = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);

        const size_t avail = pool_size - pool_used;
        if (padded > avail) {
            map_more_memory(padded - avail);
        }

        GGML_ASSERT(pool_addr != 0);

        void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
        *actual_size = padded;
        pool_used += padded;

#ifdef DEBUG_SYCL_MALLOC
        GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) padded, ptr);
#endif

        return ptr;
    }

    // Releases the most recent allocation. `size` is subtracted from the used
    // counter and `ptr` must then equal the new top of the pool.
    void free(void * ptr, size_t size) override {
#ifdef DEBUG_SYCL_MALLOC
        GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
#endif

        pool_used -= size;

        // all deallocations must be in reverse order of the allocations
        GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
    }

  private:
    // Commits enough physical memory to cover `shortfall` bytes (rounded up to
    // the granularity) and maps it directly behind the already mapped region.
    void map_more_memory(size_t shortfall) {
        namespace vmm = sycl::ext::oneapi::experimental;

        // round up to the next multiple of the granularity
        const size_t reserve_size = GGML_PAD(shortfall, granularity);

        GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);

        // allocate more physical memory
        std::optional<vmm::physical_mem> phys;
        SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));

        // reserve virtual address space (if not already reserved)
        if (pool_addr == 0) {
            SYCL_CHECK(CHECK_TRY_ERROR(pool_addr = vmm::reserve_virtual_mem(SYCL_POOL_VMM_MAX_SIZE, ctx)));
        }

        // map at the end of the pool
        void * map_ptr = nullptr;
        SYCL_CHECK(CHECK_TRY_ERROR(
            map_ptr = phys->map(pool_addr + pool_size, reserve_size, vmm::address_access_mode::read_write)));

        // remember the exact range so the destructor can unmap it
        mappings.push_back(mapping{ std::move(*phys), map_ptr });

        // add to the pool
        pool_size += reserve_size;

#ifdef DEBUG_SYCL_MALLOC
        GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
                      device, (unsigned long long) (pool_size/1024/1024),
                      (unsigned long long) (reserve_size/1024/1024));
#endif
    }
};
#endif // defined(GGML_SYCL_USE_VMM)
1614
+
1301
1615
  struct ggml_sycl_pool_host : public ggml_sycl_pool {
1302
1616
  queue_ptr qptr;
1303
1617
  int device;
@@ -1378,15 +1692,18 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
1378
1692
  }
1379
1693
 
1380
1694
  std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
1381
- // TBD: NO VMM support
1382
- // if (ggml_sycl_info().devices[device].vmm) {
1383
- // return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
1384
- // }
1385
- return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
1695
+ #if defined(GGML_SYCL_USE_VMM)
1696
+ if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
1697
+ return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
1698
+ }
1699
+ #endif // defined(GGML_SYCL_USE_VMM)
1700
+ return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
1386
1701
  }
1387
1702
 
1388
- // TBD pool with virtual memory management
1389
- // struct ggml_sycl_pool_vmm : public ggml_sycl_pool
1703
+
1704
+ std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
1705
+ return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
1706
+ }
1390
1707
 
1391
1708
  /// kernels
1392
1709
  typedef void (*ggml_sycl_op_mul_mat_t)(
@@ -1825,6 +2142,110 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
1825
2142
  }
1826
2143
  }
1827
2144
 
2145
+ static void top_k_f32_sycl(
2146
+ const float * src,
2147
+ int32_t * dst_indices,
2148
+ const int64_t ncols,
2149
+ const int64_t nrows,
2150
+ const int k,
2151
+ dpct::queue_ptr main_stream
2152
+ ) {
2153
+ const int block_size = 128;
2154
+
2155
+ const sycl::range<1> block_dims(block_size);
2156
+ const sycl::range<1> grid_dims(nrows);
2157
+
2158
+ main_stream->submit([&](sycl::handler &cgh) {
2159
+ sycl::local_accessor<float, 1> shared_vals(sycl::range<1>(block_size * k), cgh);
2160
+ sycl::local_accessor<int, 1> shared_idx(sycl::range<1>(block_size * k), cgh);
2161
+
2162
+ cgh.parallel_for(
2163
+ sycl::nd_range<1>(grid_dims * block_dims, block_dims),
2164
+ [=](sycl::nd_item<1> item_ct1) {
2165
+ const int row = item_ct1.get_group(0);
2166
+ const int tid = item_ct1.get_local_id(0);
2167
+
2168
+ if (row >= nrows) return;
2169
+
2170
+ const float * src_row = src + row * ncols;
2171
+ int32_t * dst_idx_row = dst_indices + row * k;
2172
+
2173
+ float local_vals[32];
2174
+ int local_idx[32];
2175
+
2176
+ for (int i = 0; i < k; i++) {
2177
+ local_vals[i] = -FLT_MAX;
2178
+ local_idx[i] = -1;
2179
+ }
2180
+
2181
+ for (int col = tid; col < ncols; col += block_size) {
2182
+ float val = src_row[col];
2183
+
2184
+ if (val > local_vals[k-1]) {
2185
+ int pos = k - 1;
2186
+ while (pos > 0 && val > local_vals[pos - 1]) {
2187
+ pos--;
2188
+ }
2189
+
2190
+ for (int i = k - 1; i > pos; i--) {
2191
+ local_vals[i] = local_vals[i - 1];
2192
+ local_idx[i] = local_idx[i - 1];
2193
+ }
2194
+ local_vals[pos] = val;
2195
+ local_idx[pos] = col;
2196
+ }
2197
+ }
2198
+
2199
+ for (int i = 0; i < k; i++) {
2200
+ shared_vals[tid * k + i] = local_vals[i];
2201
+ shared_idx[tid * k + i] = local_idx[i];
2202
+ }
2203
+ item_ct1.barrier(sycl::access::fence_space::local_space);
2204
+
2205
+ if (tid == 0) {
2206
+ float final_vals[32];
2207
+ int final_idx[32];
2208
+
2209
+ for (int i = 0; i < k; i++) {
2210
+ final_vals[i] = -FLT_MAX;
2211
+ final_idx[i] = -1;
2212
+ }
2213
+
2214
+ for (int t = 0; t < block_size; t++) {
2215
+ for (int i = 0; i < k; i++) {
2216
+ float val = shared_vals[t * k + i];
2217
+ int idx = shared_idx[t * k + i];
2218
+
2219
+ if (val > final_vals[k-1]) {
2220
+ int pos = k - 1;
2221
+ while (pos > 0 && val > final_vals[pos - 1]) {
2222
+ pos--;
2223
+ }
2224
+
2225
+ for (int j = k - 1; j > pos; j--) {
2226
+ final_vals[j] = final_vals[j - 1];
2227
+ final_idx[j] = final_idx[j - 1];
2228
+ }
2229
+ final_vals[pos] = val;
2230
+ final_idx[pos] = idx;
2231
+ }
2232
+ }
2233
+ }
2234
+
2235
+ for (int i = 0; i < k; i++) {
2236
+ dst_idx_row[i] = final_idx[i];
2237
+ }
2238
+
2239
+ if (k > 1) {
2240
+ int32_t temp = dst_idx_row[0];
2241
+ dst_idx_row[0] = dst_idx_row[1];
2242
+ dst_idx_row[1] = temp;
2243
+ }
2244
+ }
2245
+ });
2246
+ });
2247
+ }
2248
+
1828
2249
  static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
1829
2250
  const int nrows, queue_ptr stream) {
1830
2251
  const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
@@ -2004,6 +2425,31 @@ inline void ggml_sycl_op_mul_mat_sycl(
2004
2425
  #else
2005
2426
  bool use_fp16 = false;
2006
2427
  #endif
2428
+
2429
+ #if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
2430
+ // Fast path for bf16 src0
2431
+ if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
2432
+ row_diff == src0->ne[1]) {
2433
+ using bf16_t = sycl::ext::oneapi::bfloat16;
2434
+ ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
2435
+ if (src1->type != GGML_TYPE_BF16) {
2436
+ const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
2437
+ GGML_ASSERT(to_bf16_sycl != nullptr);
2438
+ to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
2439
+ } else {
2440
+ stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
2441
+ }
2442
+ DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
2443
+ src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
2444
+ src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
2445
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2446
+ GGML_UNUSED(dst);
2447
+ GGML_UNUSED(src1_ddq_i);
2448
+ GGML_UNUSED(src1_padded_row_size);
2449
+ return;
2450
+ }
2451
+ #endif
2452
+
2007
2453
  if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
2008
2454
  row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
2009
2455
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
@@ -2048,8 +2494,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
2048
2494
  const sycl::half alpha_f16 = 1.0f;
2049
2495
  const sycl::half beta_f16 = 0.0f;
2050
2496
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
2051
- *stream, oneapi::math::transpose::trans,
2052
- oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
2497
+ *stream, oneapi::mkl::transpose::trans,
2498
+ oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
2053
2499
  &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
2054
2500
  src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
2055
2501
  dst_f16.get(), dpct::library_data_t::real_half, ldc,
@@ -2081,21 +2527,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
2081
2527
  const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
2082
2528
  const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
2083
2529
 
2530
+ {
2531
+ const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
2532
+ const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
2084
2533
  #if GGML_SYCL_DNNL
2085
- if (!g_ggml_sycl_disable_dnn) {
2086
- DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
2087
- DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2088
- dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2089
- }
2090
- else
2534
+ if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
2535
+ DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
2536
+ DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2537
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2538
+ }
2539
+ else
2091
2540
  #endif
2092
- {
2093
- const float alpha = 1.0f;
2094
- const float beta = 0.0f;
2095
- SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
2096
- get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
2097
- src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
2098
- dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
2541
+ {
2542
+ const float alpha = 1.0f;
2543
+ const float beta = 0.0f;
2544
+ SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
2545
+ *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
2546
+ src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
2547
+ dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
2548
+ }
2099
2549
  }
2100
2550
  }
2101
2551
  GGML_UNUSED(dst);
@@ -2216,6 +2666,30 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
2216
2666
  main_stream, ctx.device);
2217
2667
  }
2218
2668
 
2669
+ static void ggml_sycl_op_top_k(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2670
+ const ggml_tensor * src0 = dst->src[0];
2671
+
2672
+ GGML_ASSERT(src0);
2673
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
2674
+ GGML_ASSERT(dst->type == GGML_TYPE_I32);
2675
+ GGML_ASSERT(ggml_is_contiguous(src0));
2676
+
2677
+ dpct::queue_ptr main_stream = ctx.stream();
2678
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
2679
+
2680
+ const float * src0_dd = static_cast<const float *>(src0->data);
2681
+ int32_t * dst_dd = static_cast<int32_t *>(dst->data);
2682
+
2683
+ const int k = dst->ne[0];
2684
+ const int64_t ncols = src0->ne[0];
2685
+ const int64_t nrows = ggml_nrows(src0);
2686
+
2687
+ GGML_ASSERT(k > 0 && k <= 32);
2688
+ GGML_ASSERT(k <= ncols);
2689
+
2690
+ top_k_f32_sycl(src0_dd, dst_dd, ncols, nrows, k, main_stream);
2691
+ }
2692
+
2219
2693
  inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2220
2694
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2221
2695
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
@@ -2248,6 +2722,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
2248
2722
  diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
2249
2723
  }
2250
2724
 
2725
+ static void tri_f32_sycl(
2726
+ const float * src,
2727
+ float * dst,
2728
+ const int64_t ne0,
2729
+ const int64_t ne1,
2730
+ const int64_t ne2,
2731
+ const int64_t ne3,
2732
+ const ggml_tri_type ttype,
2733
+ dpct::queue_ptr main_stream
2734
+ ) {
2735
+ const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
2736
+
2737
+ main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
2738
+ const int64_t idx = (int64_t) tid[0];
2739
+
2740
+ const int64_t i0 = idx % ne0;
2741
+ const int64_t t1 = idx / ne0;
2742
+ const int64_t i1 = t1 % ne1;
2743
+
2744
+ bool keep = false;
2745
+ switch (ttype) {
2746
+ case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break;
2747
+ case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
2748
+ case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break;
2749
+ case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
2750
+ default: keep = false; break;
2751
+ }
2752
+
2753
+ dst[idx] = keep ? src[idx] : 0.0f;
2754
+ });
2755
+ }
2756
+
2757
+ static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2758
+ const ggml_tensor * src0 = dst->src[0];
2759
+ GGML_ASSERT(src0);
2760
+
2761
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
2762
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
2763
+ GGML_ASSERT(ggml_is_contiguous(src0));
2764
+ GGML_ASSERT(ggml_is_contiguous(dst));
2765
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
2766
+
2767
+ dpct::queue_ptr main_stream = ctx.stream();
2768
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
2769
+
2770
+ const float * src0_dd = static_cast<const float *>(src0->data);
2771
+ float * dst_dd = static_cast<float *>(dst->data);
2772
+
2773
+ const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
2774
+
2775
+ const int64_t ne0 = src0->ne[0];
2776
+ const int64_t ne1 = src0->ne[1];
2777
+ const int64_t ne2 = src0->ne[2];
2778
+ const int64_t ne3 = src0->ne[3];
2779
+
2780
+ tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream);
2781
+ }
2782
+
2783
+
2251
2784
  inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2252
2785
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2253
2786
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -2810,7 +3343,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2810
3343
 
2811
3344
  }
2812
3345
  #if GGML_SYCL_DNNL
2813
- // oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
3346
+ // oneDNN handles strided data and does not need overhead of ggml_get_to_fp16_nc_sycl
2814
3347
  const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
2815
3348
  src1_f16_alloc.alloc(ne_src1);
2816
3349
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
@@ -2819,7 +3352,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2819
3352
  # else
2820
3353
  const int64_t ne_src1 = ggml_nelements(src1);
2821
3354
  src1_f16_alloc.alloc(ne_src1);
2822
- const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
3355
+ const to_fp16_nc_sycl_t to_fp16_nc_sycl = ggml_get_to_fp16_nc_sycl(src1->type);
2823
3356
  GGML_ASSERT(to_fp16_nc_sycl != nullptr);
2824
3357
  to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
2825
3358
  #endif
@@ -2963,8 +3496,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2963
3496
  const int64_t smb = ne12 == 1 ? s13 : s12;
2964
3497
 
2965
3498
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
2966
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
2967
- oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
3499
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
3500
+ oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
2968
3501
  src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
2969
3502
  src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
2970
3503
  mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
@@ -2988,7 +3521,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2988
3521
  });
2989
3522
 
2990
3523
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
2991
- *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
3524
+ *queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
2992
3525
  (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
2993
3526
  (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
2994
3527
  (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
@@ -3014,8 +3547,11 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
3014
3547
  inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
3015
3548
  switch (type) {
3016
3549
  case GGML_TYPE_Q4_0:
3550
+ case GGML_TYPE_Q8_0:
3017
3551
  return true;
3552
+ case GGML_TYPE_Q3_K:
3018
3553
  case GGML_TYPE_Q4_K:
3554
+ case GGML_TYPE_Q5_K:
3019
3555
  case GGML_TYPE_Q6_K:
3020
3556
  return !g_ggml_sycl_prioritize_dmmv;
3021
3557
  default:
@@ -3026,6 +3562,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
3026
3562
  inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
3027
3563
  switch (type) {
3028
3564
  case GGML_TYPE_Q4_0:
3565
+ case GGML_TYPE_Q8_0:
3029
3566
  return true;
3030
3567
  default:
3031
3568
  return false;
@@ -3035,7 +3572,10 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
3035
3572
  inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
3036
3573
  switch (type) {
3037
3574
  case GGML_TYPE_Q4_0:
3575
+ case GGML_TYPE_Q8_0:
3576
+ case GGML_TYPE_Q3_K:
3038
3577
  case GGML_TYPE_Q4_K:
3578
+ case GGML_TYPE_Q5_K:
3039
3579
  case GGML_TYPE_Q6_K:
3040
3580
  return true;
3041
3581
  default:
@@ -3056,6 +3596,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
3056
3596
  case GGML_TYPE_Q5_K:
3057
3597
  case GGML_TYPE_Q6_K:
3058
3598
  case GGML_TYPE_F16:
3599
+ case GGML_TYPE_BF16:
3059
3600
  return true;
3060
3601
  default:
3061
3602
  return false;
@@ -3073,7 +3614,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
3073
3614
  // If async allocation extension is not available, use_async should always be false.
3074
3615
  GGML_ASSERT(!use_async);
3075
3616
  #endif
3076
- return sycl::malloc(size, *stream, sycl::usm::alloc::device);
3617
+ return ggml_sycl_malloc_device(size, *stream);
3077
3618
  }
3078
3619
 
3079
3620
  static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
@@ -3087,12 +3628,58 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
3087
3628
  // If async allocation extension is not available, use_async should always be false.
3088
3629
  GGML_ASSERT(!use_async);
3089
3630
  #endif
3090
- sycl::free(ptr, *stream);
3631
+ ggml_sycl_free_device(ptr, *stream);
3091
3632
  }
3092
3633
 
3093
- static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3634
+ // RAII wrapper for temporary reorder buffers with optional host memory fallback.
3635
+ // When device allocation fails and GGML_SYCL_HOST_MEM_FALLBACK is enabled,
3636
+ // falls back to host memory so the reorder kernel can still run (over PCIe).
3637
+ // Device access to host memory requires Linux kernel 6.8+ (Ubuntu 26.04+).
3638
+ struct sycl_reorder_temp_buffer {
3639
+ void * ptr = nullptr;
3640
+ dpct::queue_ptr stream;
3641
+
3642
+ sycl_reorder_temp_buffer(dpct::queue_ptr stream, size_t size) : stream(stream) {
3643
+ ptr = sycl_ext_malloc_device(stream, size);
3644
+ #ifdef GGML_SYCL_HOST_MEM_FALLBACK
3645
+ if (!ptr) {
3646
+ ptr = sycl::malloc_host(size, *stream);
3647
+ if (ptr) {
3648
+ host_fallback = true;
3649
+ GGML_LOG_WARN("%s: device alloc of %zu bytes failed, using host memory fallback\n", __func__, size);
3650
+ }
3651
+ }
3652
+ #endif
3653
+ }
3654
+
3655
+ ~sycl_reorder_temp_buffer() {
3656
+ if (!ptr) {
3657
+ return;
3658
+ }
3659
+ if (host_fallback) {
3660
+ sycl::free(ptr, *stream);
3661
+ } else {
3662
+ sycl_ext_free(stream, ptr);
3663
+ }
3664
+ }
3665
+
3666
+ explicit operator bool() const { return ptr != nullptr; }
3667
+
3668
+ sycl_reorder_temp_buffer(const sycl_reorder_temp_buffer &) = delete;
3669
+ sycl_reorder_temp_buffer & operator=(const sycl_reorder_temp_buffer &) = delete;
3670
+
3671
+ private:
3672
+ bool host_fallback = false;
3673
+ };
3674
+
3675
+ static bool reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3094
3676
  dpct::queue_ptr stream) {
3095
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3677
+ sycl_reorder_temp_buffer tmp(stream, size);
3678
+ if (!tmp) {
3679
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3680
+ return false;
3681
+ }
3682
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3096
3683
 
3097
3684
  sycl::event copy_event;
3098
3685
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3121,16 +3708,60 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
3121
3708
  if (!g_ggml_sycl_use_async_mem_op) {
3122
3709
  reorder_event.wait_and_throw();
3123
3710
  }
3124
- sycl_ext_free(stream, tmp_buf);
3711
+ return true;
3125
3712
  }
3126
3713
 
3127
- static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3714
+ static bool reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3715
+ dpct::queue_ptr stream) {
3716
+ sycl_reorder_temp_buffer tmp(stream, size);
3717
+ if (!tmp) {
3718
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3719
+ return false;
3720
+ }
3721
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3722
+
3723
+ sycl::event copy_event;
3724
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3725
+ if (!g_ggml_sycl_use_async_mem_op) {
3726
+ copy_event.wait();
3727
+ }
3728
+
3729
+ GGML_ASSERT((size % sizeof(block_q8_0) == 0));
3730
+ GGML_ASSERT((offset % sizeof(block_q8_0) == 0));
3731
+ int offset_blks = offset / sizeof(block_q8_0);
3732
+ auto qs_ptr = data_device + offset_blks * QK8_0;
3733
+ auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;
3734
+
3735
+ auto reorder_event = stream->parallel_for(
3736
+ size / sizeof(block_q8_0),
3737
+ [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
3738
+ const block_q8_0* x = (const block_q8_0*)tmp_buf;
3739
+ const int ib = i;
3740
+
3741
+ for (int j = 0; j < QK8_0; j++)
3742
+ {
3743
+ *((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
3744
+ }
3745
+ *(d_ptr + ib) = x[ib].d;
3746
+ });
3747
+ if (!g_ggml_sycl_use_async_mem_op) {
3748
+ reorder_event.wait_and_throw();
3749
+ }
3750
+ return true;
3751
+ }
3752
+
3753
+ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3128
3754
  GGML_ASSERT(size % sizeof(block_q4_K) == 0);
3129
3755
  GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
3130
3756
 
3131
3757
  const int nblocks = size / sizeof(block_q4_K);
3132
3758
 
3133
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3759
+ sycl_reorder_temp_buffer tmp(stream, size);
3760
+ if (!tmp) {
3761
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3762
+ return false;
3763
+ }
3764
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3134
3765
 
3135
3766
  sycl::event copy_event;
3136
3767
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3159,16 +3790,117 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
3159
3790
  if (!g_ggml_sycl_use_async_mem_op) {
3160
3791
  reorder_event.wait_and_throw();
3161
3792
  }
3162
- sycl_ext_free(stream, tmp_buf);
3793
+ return true;
3794
+ }
3795
+
3796
+ static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3797
+ GGML_ASSERT(size % sizeof(block_q3_K) == 0);
3798
+ GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
3799
+
3800
+ const int nblocks = size / sizeof(block_q3_K);
3801
+
3802
+ sycl_reorder_temp_buffer tmp(stream, size);
3803
+ if (!tmp) {
3804
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3805
+ return false;
3806
+ }
3807
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3808
+
3809
+ sycl::event copy_event;
3810
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3811
+ if (!g_ggml_sycl_use_async_mem_op) {
3812
+ copy_event.wait();
3813
+ }
3814
+
3815
+ auto * qs_ptr = data_device;
3816
+ auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
3817
+ auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
3818
+ sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);
3819
+
3820
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
3821
+ const block_q3_K * x = (const block_q3_K *) tmp_buf;
3822
+ const int ib = i;
3823
+
3824
+ for (int j = 0; j < QK_K / 4; ++j) {
3825
+ qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
3826
+ }
3827
+
3828
+ for (int j = 0; j < QK_K / 8; ++j) {
3829
+ hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
3830
+ }
3831
+
3832
+ for (int j = 0; j < 12; ++j) {
3833
+ scales_ptr[ib * 12 + j] = x[ib].scales[j];
3834
+ }
3835
+
3836
+ d_ptr[ib] = x[ib].d;
3837
+ });
3838
+ if (!g_ggml_sycl_use_async_mem_op) {
3839
+ reorder_event.wait_and_throw();
3840
+ }
3841
+ return true;
3842
+ }
3843
+
3844
+ static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3845
+ GGML_ASSERT(size % sizeof(block_q5_K) == 0);
3846
+ GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
3847
+
3848
+ const int nblocks = size / sizeof(block_q5_K);
3849
+
3850
+ sycl_reorder_temp_buffer tmp(stream, size);
3851
+ if (!tmp) {
3852
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3853
+ return false;
3854
+ }
3855
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3856
+
3857
+ sycl::event copy_event;
3858
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3859
+ if (!g_ggml_sycl_use_async_mem_op) {
3860
+ copy_event.wait();
3861
+ }
3862
+
3863
+ auto * qs_ptr = data_device;
3864
+ auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
3865
+ auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
3866
+ auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
3867
+
3868
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
3869
+ const block_q5_K * x = (const block_q5_K *) tmp_buf;
3870
+ const int ib = i;
3871
+
3872
+ for (int j = 0; j < QK_K / 2; ++j) {
3873
+ qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
3874
+ }
3875
+
3876
+ for (int j = 0; j < QK_K / 8; ++j) {
3877
+ qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
3878
+ }
3879
+
3880
+ for (int j = 0; j < K_SCALE_SIZE; ++j) {
3881
+ scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
3882
+ }
3883
+
3884
+ dm_ptr[ib] = x[ib].dm;
3885
+ });
3886
+ if (!g_ggml_sycl_use_async_mem_op) {
3887
+ reorder_event.wait_and_throw();
3888
+ }
3889
+ return true;
3163
3890
  }
3164
3891
 
3165
- static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3892
+ static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3166
3893
  GGML_ASSERT(size % sizeof(block_q6_K) == 0);
3167
3894
  GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
3168
3895
 
3169
3896
  const int nblocks = size / sizeof(block_q6_K);
3170
3897
 
3171
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3898
+ sycl_reorder_temp_buffer tmp(stream, size);
3899
+ if (!tmp) {
3900
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3901
+ return false;
3902
+ }
3903
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3172
3904
 
3173
3905
  sycl::event copy_event;
3174
3906
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3207,10 +3939,10 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
3207
3939
  if (!g_ggml_sycl_use_async_mem_op) {
3208
3940
  reorder_event.wait_and_throw();
3209
3941
  }
3210
- sycl_ext_free(stream, tmp_buf);
3942
+ return true;
3211
3943
  }
3212
3944
 
3213
- static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3945
+ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3214
3946
  uint8_t * data_device = (uint8_t *) src0->data;
3215
3947
  size_t ncols = src0->ne[0];
3216
3948
  size_t nrows = src0->ne[1];
@@ -3218,17 +3950,20 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3218
3950
 
3219
3951
  switch (src0->type) {
3220
3952
  case GGML_TYPE_Q4_0:
3221
- reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
3222
- break;
3953
+ return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
3954
+ case GGML_TYPE_Q8_0:
3955
+ return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
3956
+ case GGML_TYPE_Q3_K:
3957
+ return reorder_qw_q3_k(data_device, size, 0, stream);
3223
3958
  case GGML_TYPE_Q4_K:
3224
- reorder_qw_q4_k(data_device, size, 0, stream);
3225
- break;
3959
+ return reorder_qw_q4_k(data_device, size, 0, stream);
3960
+ case GGML_TYPE_Q5_K:
3961
+ return reorder_qw_q5_k(data_device, size, 0, stream);
3226
3962
  case GGML_TYPE_Q6_K:
3227
- reorder_qw_q6_k(data_device, size, 0, stream);
3228
- break;
3963
+ return reorder_qw_q6_k(data_device, size, 0, stream);
3229
3964
  default:
3230
3965
  GGML_ABORT("reorder_qw() called with unsupported type");
3231
- break;
3966
+ return false;
3232
3967
  }
3233
3968
  }
3234
3969
 
@@ -3236,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
3236
3971
  return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
3237
3972
  ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
3238
3973
  dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
3239
- dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
3974
+ // ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder;
3975
+ // all reorderable types have a _switch_ncols kernel.
3976
+ dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
3240
3977
  }
3241
3978
 
3242
3979
  static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -3268,14 +4005,20 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
3268
4005
  break;
3269
4006
  }
3270
4007
 
3271
- reorder_qw(src0, ctx->stream());
3272
- extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
4008
+ if (reorder_qw(src0, ctx->stream())) {
4009
+ extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
4010
+ }
3273
4011
  }
3274
4012
 
3275
4013
 
3276
4014
  static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4015
+ // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
4016
+ // a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
4017
+ // need ne[0] % DMMV_X == 0.
4018
+ const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
4019
+ 2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
3277
4020
  return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
3278
- src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
4021
+ src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
3279
4022
  }
3280
4023
 
3281
4024
  static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3316,19 +4059,25 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
3316
4059
 
3317
4060
 
3318
4061
  // mmvq and mmq need the __dp4a instruction which is available for gen12+
3319
- // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
4062
+ // Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
3320
4063
  use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
3321
4064
  #ifdef SYCL_USE_XMX
3322
4065
  use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
3323
4066
  #endif // SYCL_USE_XMX
3324
4067
 
3325
- // mmvq path is faster in the CUDA backend.
3326
- if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
3327
- // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
3328
- // is enabled takes precedence over DMMV, the current if-else implementation
3329
- // requires disabling DMMV if both conditions are met
3330
- || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
3331
- use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
4068
+ // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
4069
+ // is enabled takes precedence over DMMV, the current if-else implementation
4070
+ // requires disabling DMMV if both conditions are met
4071
+
4072
+ if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
4073
+ ggml_sycl_supports_reorder_mmvq(src0->type)))) {
4074
+ // Arc770 get benefit with Q4_0 by skipping it.
4075
+ if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
4076
+ gpu_arch::intel_gpu_acm_g10 &&
4077
+ src0->type == GGML_TYPE_Q4_0)) {
4078
+ use_dequantize_mul_mat_vec =
4079
+ use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
4080
+ }
3332
4081
  }
3333
4082
 
3334
4083
  if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
@@ -3373,35 +4122,17 @@ struct mmid_row_mapping {
3373
4122
 
3374
4123
  __dpct_inline__ static void k_copy_src1_to_contiguous(
3375
4124
  const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
3376
- int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
3377
- const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
4125
+ const mmid_row_mapping *__restrict__ row_mapping,
3378
4126
  int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
3379
- const sycl::nd_item<3> &item_ct1, int &src1_row) {
3380
- int32_t iid1 = item_ct1.get_group(2);
3381
- int32_t id = item_ct1.get_group(1);
3382
-
3383
- const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
4127
+ const sycl::nd_item<3> &item_ct1) {
4128
+ const int32_t src1_row = item_ct1.get_group(2);
3384
4129
 
3385
- if (row_id_i != i02) {
3386
- return;
3387
- }
4130
+ const int32_t iid1 = row_mapping[src1_row].i2;
4131
+ const int32_t id = row_mapping[src1_row].i1;
3388
4132
 
3389
4133
  const int64_t i11 = id % ne11;
3390
4134
  const int64_t i12 = iid1;
3391
4135
 
3392
- if (item_ct1.get_local_id(2) == 0) {
3393
- src1_row =
3394
- dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
3395
- cur_src1_row, 1);
3396
- row_mapping[src1_row] = {id, iid1};
3397
- }
3398
- /*
3399
- DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
3400
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
3401
- performance if there is no access to global memory.
3402
- */
3403
- item_ct1.barrier();
3404
-
3405
4136
  const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
3406
4137
  float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
3407
4138
 
@@ -3431,6 +4162,92 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
3431
4162
  }
3432
4163
  }
3433
4164
 
4165
+ // Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
4166
+ static bool ggml_sycl_mul_mat_id_mmvq_fused(
4167
+ ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
4168
+ const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
4169
+ {
4170
+ const int64_t ne10 = src1->ne[0];
4171
+ const int64_t ne11 = src1->ne[1];
4172
+ const int64_t ne12 = src1->ne[2];
4173
+ if (ne12 != 1) return false;
4174
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
4175
+ if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
4176
+ if (!ggml_is_contiguous(src1)) return false;
4177
+
4178
+ // Reorder layout not supported; fall back.
4179
+ const ggml_tensor_extra_gpu * src0_extra =
4180
+ static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
4181
+ if (src0_extra && src0_extra->optimized_feature.reorder) return false;
4182
+
4183
+ const int64_t n_ids_per_group = ids->ne[0];
4184
+ if (ids->ne[1] != 1) return false;
4185
+ if (ne11 != 1 && ne11 != n_ids_per_group) return false;
4186
+
4187
+ const queue_ptr stream = ctx.stream();
4188
+ const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
4189
+ const int n_experts_used = (int) n_ids_per_group;
4190
+ const int nrows = (int) src0->ne[1];
4191
+
4192
+ ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
4193
+ (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
4194
+ char * src1_ddq = src1_q8_alloc.get();
4195
+ quantize_row_q8_1_sycl<quantize_q8_1>(
4196
+ (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
4197
+ src1_padded_cols, stream);
4198
+
4199
+ const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
4200
+ const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
4201
+
4202
+ return ggml_sycl_mul_mat_vec_q_id(
4203
+ src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
4204
+ (float *) dst->data, (int) ne10, nrows, n_experts_used,
4205
+ /*expert_weight_stride=*/ src0->nb[2],
4206
+ /*dst_row_stride=*/ dst->nb[1],
4207
+ src1_row_stride, stream);
4208
+ }
4209
+
4210
+ // counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
4211
+ // builds a projection of a memory layout where each expert's slice is contiguous
4212
+ static void mmid_counting_sort_rows(
4213
+ const ggml_tensor * ids, const char * ids_host,
4214
+ int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
4215
+ std::vector<int64_t> & expert_counts,
4216
+ std::vector<int64_t> & expert_row_offsets,
4217
+ std::vector<mmid_row_mapping> & routed_row_src) {
4218
+
4219
+ // frequencies: how many routed rows each expert "owns"
4220
+ expert_counts.assign(n_as, 0);
4221
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
4222
+ for (int64_t id = 0; id < n_ids; id++) {
4223
+ const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
4224
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4225
+ expert_counts[row_id_i]++;
4226
+ }
4227
+ }
4228
+
4229
+ // where each expert's slice starts (row indices) and the previous ends
4230
+ expert_row_offsets.assign(n_as + 1, 0);
4231
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
4232
+ expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
4233
+ }
4234
+
4235
+ std::vector<int64_t> expert_row_next = expert_row_offsets;
4236
+ routed_row_src.resize(n_routed_rows);
4237
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
4238
+ for (int64_t id = 0; id < n_ids; id++) {
4239
+ const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
4240
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4241
+
4242
+ // find and validate the next free row for a given expert (row_id_i)
4243
+ const int64_t routed_row = expert_row_next[row_id_i]++;
4244
+ GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
4245
+ GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
4246
+ routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
4247
+ }
4248
+ }
4249
+ }
4250
+
3434
4251
  static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3435
4252
  ggml_tensor *dst) try {
3436
4253
  scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3446,6 +4263,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3446
4263
  const int64_t n_as = ne02;
3447
4264
  const int64_t n_ids = ids->ne[0];
3448
4265
 
4266
+ if (ne12 == 1) {
4267
+ if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
4268
+ return;
4269
+ }
4270
+ }
4271
+
3449
4272
  std::vector<char> ids_host(ggml_nbytes(ids));
3450
4273
  const char * ids_dev = (const char *) ids->data;
3451
4274
 
@@ -3496,105 +4319,98 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3496
4319
  }
3497
4320
  }
3498
4321
  } else {
3499
- ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
3500
- ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
4322
+ const int64_t n_routed_rows = ids->ne[1] * n_ids;
4323
+ ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
4324
+ ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
3501
4325
 
3502
4326
  src1_row.data = src1_contiguous.get();
3503
4327
  dst_row.data = dst_contiguous.get();
3504
4328
 
3505
- for (int64_t i02 = 0; i02 < n_as; i02++) {
3506
- int64_t num_src1_rows = 0;
3507
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
3508
- for (int64_t id = 0; id < n_ids; id++) {
3509
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
4329
+ // how many "owned" routed rows to pass to each expert
4330
+ std::vector<int64_t> expert_row_counts;
4331
+ // where each expert's slice starts and the previous ends (row indices, right-exclusive)
4332
+ std::vector<int64_t> expert_row_offsets;
4333
+ // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
4334
+ std::vector<mmid_row_mapping> routed_row_src;
3510
4335
 
3511
- GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4336
+ mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
4337
+ expert_row_counts, expert_row_offsets, routed_row_src);
3512
4338
 
3513
- if (row_id_i != i02) {
3514
- continue;
3515
- }
4339
+ ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
4340
+ SYCL_CHECK(CHECK_TRY_ERROR(
4341
+ stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
3516
4342
 
3517
- num_src1_rows++;
3518
- }
3519
- }
4343
+ const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
4344
+ assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
4345
+
4346
+ {
4347
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
4348
+ sycl::range<3> grid_dims(1, 1, n_routed_rows);
4349
+ stream->submit([&](sycl::handler &cgh) {
4350
+ char *__restrict src1_contiguous_get =
4351
+ src1_contiguous.get();
4352
+ mmid_row_mapping *__restrict dev_row_mapping_get =
4353
+ dev_row_mapping.get();
4354
+
4355
+ cgh.parallel_for(
4356
+ sycl::nd_range<3>(grid_dims * block_dims, block_dims),
4357
+ [=](sycl::nd_item<3> item_ct1) {
4358
+ k_copy_src1_to_contiguous(
4359
+ src1_original, src1_contiguous_get,
4360
+ dev_row_mapping_get,
4361
+ ne11, ne10, nb11, nb12,
4362
+ item_ct1);
4363
+ });
4364
+ });
4365
+ }
4366
+
4367
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
4368
+ const int64_t num_src1_rows = expert_row_counts[i02];
3520
4369
 
3521
4370
  if (num_src1_rows == 0) {
3522
4371
  continue;
3523
4372
  }
3524
4373
 
3525
-
3526
- ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
3527
- ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
3528
- SYCL_CHECK(CHECK_TRY_ERROR(
3529
- stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
3530
-
3531
- const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
3532
- assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
3533
-
3534
- {
3535
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
3536
- sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3537
- stream->submit([&](sycl::handler &cgh) {
3538
- sycl::local_accessor<int, 0> src1_row_acc(cgh);
3539
-
3540
- char *__restrict src1_contiguous_get =
3541
- src1_contiguous.get();
3542
- int *__restrict dev_cur_src1_row_get =
3543
- dev_cur_src1_row.get();
3544
- mmid_row_mapping *__restrict dev_row_mapping_get =
3545
- dev_row_mapping.get();
3546
- size_t ids_nb_ct6 = ids->nb[1];
3547
- size_t ids_nb_ct7 = ids->nb[0];
3548
-
3549
- cgh.parallel_for(
3550
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3551
- [=](sycl::nd_item<3> item_ct1) {
3552
- k_copy_src1_to_contiguous(
3553
- src1_original, src1_contiguous_get,
3554
- dev_cur_src1_row_get,
3555
- dev_row_mapping_get, ids_dev, i02,
3556
- ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
3557
- item_ct1, src1_row_acc);
3558
- });
3559
- });
3560
- }
4374
+ const int64_t expert_row_offset = expert_row_offsets[i02];
3561
4375
 
3562
4376
  src0_row.data = src0_original + i02*nb02;
3563
4377
 
3564
4378
  GGML_ASSERT(nb11 == sizeof(float)*ne10);
3565
4379
  GGML_ASSERT(nb1 == sizeof(float)*ne0);
4380
+ src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
3566
4381
  src1_row.ne[1] = num_src1_rows;
3567
4382
 
3568
4383
  src1_row.nb[1] = nb11;
3569
4384
  src1_row.nb[2] = num_src1_rows*nb11;
3570
4385
  src1_row.nb[3] = num_src1_rows*nb11;
3571
4386
 
4387
+ dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
3572
4388
  dst_row.ne[1] = num_src1_rows;
3573
4389
  dst_row.nb[1] = nb1;
3574
4390
  dst_row.nb[2] = num_src1_rows*nb1;
3575
4391
  dst_row.nb[3] = num_src1_rows*nb1;
3576
4392
 
3577
4393
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
4394
+ }
3578
4395
 
3579
- {
3580
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
3581
- sycl::range<3> grid_dims(1, 1, num_src1_rows);
3582
- stream->submit([&](sycl::handler &cgh) {
3583
- const char *__restrict dst_contiguous_get =
3584
- dst_contiguous.get();
3585
- const mmid_row_mapping *__restrict dev_row_mapping_get =
3586
- dev_row_mapping.get();
3587
-
3588
- cgh.parallel_for(
3589
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3590
- [=](sycl::nd_item<3> item_ct1) {
3591
- k_copy_dst_from_contiguous(dst_original,
3592
- dst_contiguous_get,
3593
- dev_row_mapping_get,
3594
- ne0, nb1, nb2, item_ct1);
3595
- });
3596
- });
3597
- }
4396
+ {
4397
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
4398
+ sycl::range<3> grid_dims(1, 1, n_routed_rows);
4399
+ stream->submit([&](sycl::handler &cgh) {
4400
+ const char *__restrict dst_contiguous_get =
4401
+ dst_contiguous.get();
4402
+ const mmid_row_mapping *__restrict dev_row_mapping_get =
4403
+ dev_row_mapping.get();
4404
+
4405
+ cgh.parallel_for(
4406
+ sycl::nd_range<3>(grid_dims * block_dims, block_dims),
4407
+ [=](sycl::nd_item<3> item_ct1) {
4408
+ k_copy_dst_from_contiguous(dst_original,
4409
+ dst_contiguous_get,
4410
+ dev_row_mapping_get,
4411
+ ne0, nb1, nb2, item_ct1);
4412
+ });
4413
+ });
3598
4414
  }
3599
4415
  }
3600
4416
  }
@@ -3624,6 +4440,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
3624
4440
  ggml_sycl_op_im2col(ctx, dst);
3625
4441
  }
3626
4442
 
4443
+ static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
4444
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
4445
+ ggml_sycl_op_im2col_3d(ctx, dst);
4446
+ }
4447
+
3627
4448
  static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
3628
4449
  scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
3629
4450
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
@@ -3771,6 +4592,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3771
4592
  case GGML_UNARY_OP_EXP:
3772
4593
  ggml_sycl_exp(ctx, dst);
3773
4594
  break;
4595
+ case GGML_UNARY_OP_SOFTPLUS:
4596
+ ggml_sycl_softplus(ctx, dst);
4597
+ break;
3774
4598
  case GGML_UNARY_OP_SGN:
3775
4599
  ggml_sycl_sgn(ctx, dst);
3776
4600
  break;
@@ -3897,6 +4721,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3897
4721
  case GGML_OP_TRANSPOSE:
3898
4722
  GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
3899
4723
  break;
4724
+ case GGML_OP_TRI:
4725
+ ggml_sycl_op_tri(ctx, dst);
4726
+ break;
3900
4727
  case GGML_OP_DIAG_MASK_INF:
3901
4728
  ggml_sycl_diag_mask_inf(ctx, dst);
3902
4729
  break;
@@ -3909,9 +4736,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3909
4736
  case GGML_OP_ROPE:
3910
4737
  ggml_sycl_rope(ctx, dst);
3911
4738
  break;
4739
+ case GGML_OP_ROPE_BACK:
4740
+ ggml_sycl_rope_back(ctx, dst);
4741
+ break;
3912
4742
  case GGML_OP_IM2COL:
3913
4743
  ggml_sycl_im2col(ctx, dst);
3914
4744
  break;
4745
+ case GGML_OP_IM2COL_3D:
4746
+ ggml_sycl_im2col_3d(ctx, dst);
4747
+ break;
3915
4748
  case GGML_OP_POOL_2D:
3916
4749
  ggml_sycl_pool2d(ctx, dst);
3917
4750
  break;
@@ -3927,6 +4760,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3927
4760
  case GGML_OP_ARGSORT:
3928
4761
  ggml_sycl_argsort(ctx, dst);
3929
4762
  break;
4763
+ case GGML_OP_TOP_K:
4764
+ ggml_sycl_op_top_k(ctx, dst);
4765
+ break;
3930
4766
  case GGML_OP_TIMESTEP_EMBEDDING:
3931
4767
  ggml_sycl_op_timestep_embedding(ctx, dst);
3932
4768
  break;
@@ -3939,15 +4775,36 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3939
4775
  case GGML_OP_GATED_LINEAR_ATTN:
3940
4776
  ggml_sycl_op_gated_linear_attn(ctx, dst);
3941
4777
  break;
4778
+ case GGML_OP_GATED_DELTA_NET:
4779
+ ggml_sycl_gated_delta_net(ctx, dst);
4780
+ break;
3942
4781
  case GGML_OP_SSM_CONV:
3943
4782
  ggml_sycl_ssm_conv(ctx, dst);
3944
4783
  break;
4784
+ case GGML_OP_SSM_SCAN:
4785
+ ggml_sycl_ssm_scan(ctx, dst);
4786
+ break;
4787
+ case GGML_OP_FILL:
4788
+ ggml_sycl_fill(ctx, dst);
4789
+ break;
4790
+ case GGML_OP_CUMSUM:
4791
+ ggml_sycl_cumsum(ctx, dst);
4792
+ break;
4793
+ case GGML_OP_DIAG:
4794
+ ggml_sycl_diag(ctx, dst);
4795
+ break;
4796
+ case GGML_OP_SOLVE_TRI:
4797
+ ggml_sycl_solve_tri(ctx, dst);
4798
+ break;
3945
4799
  case GGML_OP_ROLL:
3946
4800
  ggml_sycl_roll(ctx, dst);
3947
4801
  break;
3948
4802
  case GGML_OP_ARANGE:
3949
4803
  ggml_sycl_arange(ctx, dst);
3950
4804
  break;
4805
+ case GGML_OP_FLASH_ATTN_EXT:
4806
+ ggml_sycl_flash_attn_ext(ctx, dst);
4807
+ break;
3951
4808
  default:
3952
4809
  return false;
3953
4810
  }
@@ -3978,16 +4835,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
3978
4835
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
3979
4836
  ggml_sycl_set_device(device);
3980
4837
 
3981
- /*
3982
- DPCT1009:218: SYCL uses exceptions to report errors and does not use the
3983
- error codes. The original code was commented out and a warning string was
3984
- inserted. You need to rewrite this code.
3985
- */
3986
- /*
3987
- DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
3988
- device information which may not be supported by all compilers or runtimes.
3989
- You may need to adjust the code.
3990
- */
3991
4838
  SYCL_CHECK(CHECK_TRY_ERROR(
3992
4839
  dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
3993
4840
  }
@@ -4109,6 +4956,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
4109
4956
  if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
4110
4957
  continue;
4111
4958
  }
4959
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
4960
+ continue;
4961
+ }
4112
4962
  #ifndef NDEBUG
4113
4963
  assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
4114
4964
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -4252,6 +5102,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
4252
5102
  /* .free = */ ggml_backend_sycl_free,
4253
5103
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
4254
5104
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
5105
+ /* .set_tensor_2d_async = */ NULL,
5106
+ /* .get_tensor_2d_async = */ NULL,
4255
5107
  /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
4256
5108
  // // TODO: update for the new
4257
5109
  // interface
@@ -4386,10 +5238,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4386
5238
  case GGML_UNARY_OP_GELU_QUICK:
4387
5239
  case GGML_UNARY_OP_GELU_ERF:
4388
5240
  case GGML_UNARY_OP_EXP:
5241
+ case GGML_UNARY_OP_SOFTPLUS:
4389
5242
  case GGML_UNARY_OP_ELU:
5243
+ case GGML_UNARY_OP_CEIL:
4390
5244
  return true;
4391
5245
  case GGML_UNARY_OP_FLOOR:
4392
- case GGML_UNARY_OP_CEIL:
4393
5246
  case GGML_UNARY_OP_ROUND:
4394
5247
  case GGML_UNARY_OP_TRUNC:
4395
5248
  #if defined (GGML_SYCL_F16)
@@ -4419,26 +5272,19 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4419
5272
  struct ggml_tensor * a = op->src[0];
4420
5273
  struct ggml_tensor * b = op->src[1];
4421
5274
 
4422
- if (a->ne[3] != b->ne[3]) {
5275
+ // disable Q1_0 until implementation
5276
+ if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
4423
5277
  return false;
4424
5278
  }
4425
- ggml_type a_type = a->type;
4426
- if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
4427
- a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
4428
- a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
4429
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
4430
- ) {
4431
- if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
4432
- return false;
4433
- }
4434
- }
4435
- ggml_type src0_type = op->src[0]->type;
4436
- if (src0_type == GGML_TYPE_BF16 ) {
4437
- // TODO: support GGML_TYPE_BF16
4438
- // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
5279
+
5280
+ if (a->ne[3] != b->ne[3]) {
4439
5281
  return false;
4440
5282
  }
4441
5283
 
5284
+ ggml_type src0_type = op->src[0]->type;
5285
+
5286
+
5287
+
4442
5288
  // TODO: The configuration below needs more work to be supported with oneDNN
4443
5289
  if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
4444
5290
  a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
@@ -4457,12 +5303,31 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4457
5303
  case GGML_OP_GET_ROWS:
4458
5304
  {
4459
5305
  switch (op->src[0]->type) {
5306
+ case GGML_TYPE_I32:
4460
5307
  case GGML_TYPE_F16:
5308
+ case GGML_TYPE_BF16:
4461
5309
  case GGML_TYPE_F32:
5310
+ case GGML_TYPE_Q1_0:
5311
+ case GGML_TYPE_MXFP4:
5312
+ case GGML_TYPE_NVFP4:
5313
+ case GGML_TYPE_IQ2_XXS:
5314
+ case GGML_TYPE_IQ2_XS:
5315
+ case GGML_TYPE_IQ2_S:
5316
+ case GGML_TYPE_IQ3_XXS:
5317
+ case GGML_TYPE_IQ1_S:
5318
+ case GGML_TYPE_IQ1_M:
5319
+ case GGML_TYPE_IQ3_S:
5320
+ case GGML_TYPE_IQ4_NL:
5321
+ case GGML_TYPE_IQ4_XS:
5322
+ case GGML_TYPE_Q2_K:
5323
+ case GGML_TYPE_Q3_K:
4462
5324
  case GGML_TYPE_Q4_0:
4463
5325
  case GGML_TYPE_Q4_1:
5326
+ case GGML_TYPE_Q4_K:
4464
5327
  case GGML_TYPE_Q5_0:
4465
5328
  case GGML_TYPE_Q5_1:
5329
+ case GGML_TYPE_Q5_K:
5330
+ case GGML_TYPE_Q6_K:
4466
5331
  case GGML_TYPE_Q8_0:
4467
5332
  return true;
4468
5333
  default:
@@ -4588,18 +5453,23 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4588
5453
  return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
4589
5454
  #endif
4590
5455
  case GGML_OP_NORM:
4591
- return true;
4592
5456
  case GGML_OP_L2_NORM:
4593
5457
  case GGML_OP_GROUP_NORM:
4594
- return ggml_is_contiguous(op->src[0]);
4595
5458
  case GGML_OP_RMS_NORM:
4596
- return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
5459
+ return true;
4597
5460
  case GGML_OP_RMS_NORM_BACK:
4598
- return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
5461
+ return ggml_is_contiguous(op->src[0]);
4599
5462
  case GGML_OP_SCALE:
4600
5463
  return true;
4601
5464
  case GGML_OP_CONT:
4602
5465
  return op->src[0]->type != GGML_TYPE_BF16;
5466
+ case GGML_OP_TRI:
5467
+ {
5468
+ const ggml_tensor * src0 = op->src[0];
5469
+ return src0 &&
5470
+ op->type == GGML_TYPE_F32 &&
5471
+ ggml_is_contiguous(src0);
5472
+ }
4603
5473
  case GGML_OP_DIAG_MASK_INF:
4604
5474
  return true;
4605
5475
  case GGML_OP_SOFT_MAX:
@@ -4610,10 +5480,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4610
5480
  return max_bias == 0.0f;
4611
5481
  }
4612
5482
  case GGML_OP_ROPE:
5483
+ case GGML_OP_ROPE_BACK:
4613
5484
  case GGML_OP_IM2COL:
4614
- return true;
5485
+ case GGML_OP_IM2COL_3D:
4615
5486
  case GGML_OP_UPSCALE:
4616
- return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
5487
+ return true;
4617
5488
  case GGML_OP_SUM:
4618
5489
  case GGML_OP_SUM_ROWS:
4619
5490
  case GGML_OP_MEAN:
@@ -4621,20 +5492,30 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4621
5492
  case GGML_OP_ARGSORT:
4622
5493
  return op->src[0]->ne[0] * sizeof(int) <=
4623
5494
  ggml_sycl_info().devices[device].smpbo;
5495
+ case GGML_OP_TOP_K: {
5496
+ const ggml_tensor * src0 = op->src[0];
5497
+ const int k = op->ne[0];
5498
+ return src0 &&
5499
+ op->type == GGML_TYPE_I32 &&
5500
+ src0->type == GGML_TYPE_F32 &&
5501
+ ggml_is_contiguous(src0) &&
5502
+ k > 0 && k <= 32;
5503
+ }
4624
5504
  case GGML_OP_POOL_2D:
4625
- case GGML_OP_ACC:
4626
5505
  return true;
5506
+ case GGML_OP_ACC:
5507
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
4627
5508
  case GGML_OP_PAD:
4628
- // TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
4629
5509
  if (ggml_get_op_params_i32(op, 8) != 0) {
4630
5510
  return false;
4631
5511
  }
4632
- return ggml_is_contiguous(op->src[0]);
5512
+ return true;
4633
5513
  case GGML_OP_LEAKY_RELU:
4634
5514
  case GGML_OP_TIMESTEP_EMBEDDING:
4635
5515
  case GGML_OP_RWKV_WKV6:
4636
5516
  case GGML_OP_RWKV_WKV7:
4637
5517
  case GGML_OP_GATED_LINEAR_ATTN:
5518
+ case GGML_OP_GATED_DELTA_NET:
4638
5519
  return true;
4639
5520
  case GGML_OP_SSM_CONV:
4640
5521
  return op->type == GGML_TYPE_F32 &&
@@ -4644,6 +5525,23 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4644
5525
  return op->type == GGML_TYPE_F32;
4645
5526
  case GGML_OP_ARANGE:
4646
5527
  return op->type == GGML_TYPE_F32;
5528
+ case GGML_OP_SSM_SCAN:
5529
+ if (op->src[3]->ne[0] == 1) {
5530
+ // Mamba2
5531
+ // (kernel only supports (d_state == 128 || d_state == 256) && d_head % WARP_SIZE == 0)
5532
+ return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % WARP_SIZE == 0;
5533
+ } else {
5534
+ // TODO Mamba-1 not yet ported to SYCL
5535
+ return false;
5536
+ }
5537
+ case GGML_OP_FILL:
5538
+ case GGML_OP_CUMSUM:
5539
+ case GGML_OP_DIAG:
5540
+ return true;
5541
+ case GGML_OP_SOLVE_TRI:
5542
+ return op->src[0]->ne[0] <= SYCL_SOLVE_TRI_MAX_N && op->src[1]->ne[0] <= SYCL_SOLVE_TRI_MAX_K;
5543
+ case GGML_OP_FLASH_ATTN_EXT:
5544
+ return ggml_sycl_flash_attn_ext_supported(device, op);
4647
5545
  default:
4648
5546
  return false;
4649
5547
  }