whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -19,6 +19,7 @@
19
19
  #include <cstdlib>
20
20
  #include <float.h>
21
21
  #include <limits>
22
+ #include <optional>
22
23
  #include <stdint.h>
23
24
  #include <stdio.h>
24
25
  #include <vector>
@@ -30,9 +31,18 @@
30
31
  #include <regex>
31
32
 
32
33
  #include <sycl/sycl.hpp>
34
+ #include <sycl/backend.hpp>
35
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
36
+ #include <level_zero/ze_api.h>
37
+ #endif
33
38
  #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
34
39
  # include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
35
40
  #endif
41
+ #if SYCL_EXT_ONEAPI_VIRTUAL_MEM
42
+ # include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
43
+ # include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
44
+ # define GGML_SYCL_USE_VMM
45
+ #endif
36
46
  #include <sycl/half_type.hpp>
37
47
 
38
48
  #include "ggml.h"
@@ -44,7 +54,6 @@
44
54
  #include "ggml-sycl/backend.hpp"
45
55
  #include "ggml-sycl/common.hpp"
46
56
  #include "ggml-sycl/element_wise.hpp"
47
- #include "ggml-sycl/gated_delta_net.hpp"
48
57
  #include "ggml-sycl/gemm.hpp"
49
58
  #include "ggml-sycl/getrows.hpp"
50
59
  #include "ggml-sycl/norm.hpp"
@@ -55,15 +64,23 @@
55
64
  #include "ggml-sycl/set.hpp"
56
65
  #include "ggml-sycl/ssm_conv.hpp"
57
66
  #include "ggml-sycl/sycl_hw.hpp"
58
-
67
+ #include "ggml-sycl/ssm_scan.hpp"
68
+ #include "ggml-sycl/fill.hpp"
69
+ #include "ggml-sycl/cumsum.hpp"
70
+ #include "ggml-sycl/diag.hpp"
71
+ #include "ggml-sycl/solve_tri.hpp"
72
+ #include "ggml-sycl/gated_delta_net.hpp"
59
73
 
60
74
  static bool g_sycl_loaded = false;
61
75
  int g_ggml_sycl_debug = 0;
62
76
  int g_ggml_sycl_disable_optimize = 0;
63
77
  int g_ggml_sycl_disable_graph = 0;
64
78
  int g_ggml_sycl_disable_dnn = 0;
79
+ int g_ggml_sycl_enable_vmm = 1;
65
80
  int g_ggml_sycl_prioritize_dmmv = 0;
66
81
  int g_ggml_sycl_use_async_mem_op = 0;
82
+ int g_ggml_sycl_use_async_mem_op_requested = 1;
83
+ int g_ggml_sycl_enable_level_zero = 0;
67
84
  int g_ggml_sycl_enable_flash_attention = 1;
68
85
 
69
86
 
@@ -86,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
86
103
  // GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
87
104
  // #endif
88
105
  for (int i = 0; i < info.device_count; ++i) {
89
- info.devices[i].vmm = 0;
90
106
  dpct::device_info prop;
91
- sycl::device device = dpct::dev_mgr::instance().get_device(i);
107
+ auto & device = dpct::dev_mgr::instance().get_device(i);
92
108
 
93
109
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
94
110
  prop, device)));
95
111
 
112
+ #if !defined(GGML_SYCL_USE_VMM)
113
+ info.devices[i].vmm = 0;
114
+ #else
115
+ info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
116
+ if (info.devices[i].vmm) {
117
+ // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
118
+ // but the L0 API requires a larger page size for allocs above 2 MiB and
119
+ // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
120
+ // Here we clamp it to 2 MiB for simplicity, but other devices may require
121
+ // calling zeVirtualMemQueryPageSize or yet unexposed public API.
122
+ const size_t physical_page = 2ull << 20; // 2 MiB
123
+ info.devices[i].vmm_granularity = std::max<size_t>(
124
+ sycl::ext::oneapi::experimental::get_mem_granularity(
125
+ device, sycl::context(device)),
126
+ physical_page);
127
+ }
128
+ #endif
129
+
96
130
  info.default_tensor_split[i] = total_vram;
97
131
  total_vram += prop.get_global_mem_size();
98
132
 
@@ -105,7 +139,14 @@ static ggml_sycl_device_info ggml_sycl_init() {
105
139
 
106
140
  info.max_work_group_sizes[i] = prop.get_max_work_group_size();
107
141
  info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
142
+ info.devices[i].hw_info = get_device_hw_info(&device);
108
143
 
144
+ // Only check GPU devices; CPU devices use OpenCL and would otherwise
145
+ // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
146
+ if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
147
+ GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
148
+ info.ext_oneapi_level_zero = false;
149
+ }
109
150
  }
110
151
 
111
152
  for (int id = 0; id < info.device_count; ++id) {
@@ -217,7 +258,13 @@ static void ggml_check_sycl() try {
217
258
  g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
218
259
  g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
219
260
  g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
261
+ g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
220
262
  g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
263
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
264
+ g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
265
+ #else
266
+ g_ggml_sycl_enable_level_zero = 0;
267
+ #endif
221
268
 
222
269
  #ifdef SYCL_FLASH_ATTN
223
270
  g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
@@ -248,6 +295,16 @@ static void ggml_check_sycl() try {
248
295
  #else
249
296
  GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
250
297
  #endif
298
+ #if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
299
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
300
+ #else
301
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
302
+ #endif
303
+ #if defined(GGML_SYCL_USE_VMM)
304
+ GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
305
+ #else
306
+ GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
307
+ #endif
251
308
 
252
309
  GGML_LOG_INFO("Running with Environment Variables:\n");
253
310
  GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
@@ -257,12 +314,24 @@ static void ggml_check_sycl() try {
257
314
  #else
258
315
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
259
316
  #endif
317
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
318
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
319
+ #else
320
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
321
+ #endif
260
322
  #if GGML_SYCL_DNNL
261
323
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
262
324
  #else
263
325
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
326
+ #endif
327
+ #if defined(GGML_SYCL_USE_VMM)
328
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
329
+ #else
330
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
264
331
  #endif
265
332
  GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
333
+ g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
334
+ GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
266
335
 
267
336
  #ifdef SYCL_FLASH_ATTN
268
337
  GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
@@ -278,11 +347,11 @@ static void ggml_check_sycl() try {
278
347
  fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
279
348
  #endif
280
349
  */
281
- // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
282
- // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
283
- // other places.
350
+ // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
351
+ // staging path while preserving queue ordering semantics. Graph support still depends on the extension being
352
+ // available, but it no longer needs to control the non-graph fast path.
284
353
  #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
285
- g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
354
+ g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
286
355
  if (g_ggml_sycl_use_async_mem_op) {
287
356
  for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
288
357
  if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
@@ -366,7 +435,7 @@ struct ggml_backend_sycl_buffer_context {
366
435
  ~ggml_backend_sycl_buffer_context() {
367
436
  if (dev_ptr != nullptr) {
368
437
  ggml_sycl_set_device(device);
369
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
438
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
370
439
  }
371
440
 
372
441
  //release extra used by tensors
@@ -412,11 +481,22 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
412
481
  assert(tensor->view_src->buffer->buft == buffer->buft);
413
482
  return GGML_STATUS_SUCCESS;
414
483
  }
415
- if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
416
- !g_ggml_sycl_disable_optimize) {
417
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
418
- tensor->extra = extra;
419
- ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
484
+
485
+ if (!g_ggml_sycl_disable_optimize) {
486
+ // set reorder extra buffer based on supported type
487
+ switch (tensor->type) {
488
+ case GGML_TYPE_Q4_0:
489
+ case GGML_TYPE_Q8_0:
490
+ case GGML_TYPE_Q4_K:
491
+ case GGML_TYPE_Q6_K:{
492
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
493
+ tensor->extra = extra;
494
+ ctx->tensor_extras.push_back(extra);
495
+ break;
496
+ }
497
+ default:
498
+ break;
499
+ }
420
500
  }
421
501
 
422
502
  if (ggml_is_quantized(tensor->type)) {
@@ -488,8 +568,43 @@ catch (sycl::exception const &exc) {
488
568
  std::exit(1);
489
569
  }
490
570
 
571
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
572
+ static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
573
+ if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
574
+ return false;
575
+ }
576
+
577
+ ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
578
+ ze_device_properties_t props = {};
579
+ props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
580
+ ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
581
+ return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
582
+ }
583
+ #endif
584
+
491
585
  static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
492
586
  const void *ptr_src, size_t size) {
587
+ #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
588
+ // Use Level Zero direct copy for dGPU-to-dGPU transfers.
589
+ const bool l0_copy_supported =
590
+ ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
591
+ if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
592
+ auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
593
+ auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
594
+ ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
595
+ 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
596
+ ze_command_list_handle_t cl;
597
+ ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
598
+ if (r == ZE_RESULT_SUCCESS) {
599
+ r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
600
+ zeCommandListDestroy(cl);
601
+ if (r == ZE_RESULT_SUCCESS) {
602
+ return;
603
+ }
604
+ }
605
+ }
606
+ #endif
607
+ // Host-staged copy
493
608
  char *host_buf = (char *)malloc(size);
494
609
  q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
495
610
  q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
@@ -570,9 +685,15 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
570
685
  SYCL_CHECK(
571
686
  CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
572
687
 
573
- SYCL_CHECK(CHECK_TRY_ERROR((*stream)
574
- .memset(ctx->dev_ptr, value, buffer->size)
575
- .wait()));
688
+ constexpr size_t MAX_CHUNK = 2ULL << 30; // 2 GiB
689
+ for (size_t off = 0; off < buffer->size; off += MAX_CHUNK) {
690
+ size_t chunk = std::min(buffer->size - off, MAX_CHUNK);
691
+ SYCL_CHECK(CHECK_TRY_ERROR(
692
+ (*stream)
693
+ .memset(static_cast<char*>(ctx->dev_ptr) + off, value, chunk)
694
+ .wait()
695
+ ));
696
+ }
576
697
  }
577
698
  catch (sycl::exception const &exc) {
578
699
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -622,6 +743,8 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
622
743
  /* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
623
744
  /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
624
745
  /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
746
+ /* .set_tensor_2d = */ NULL,
747
+ /* .get_tensor_2d = */ NULL,
625
748
  /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
626
749
  /* .clear = */ ggml_backend_sycl_buffer_clear,
627
750
  /* .reset = */ ggml_backend_sycl_buffer_reset,
@@ -651,8 +774,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
651
774
  size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
652
775
 
653
776
  void * dev_ptr;
654
- SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
655
- size, *stream)));
777
+ SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
656
778
  if (!dev_ptr) {
657
779
  GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
658
780
  return nullptr;
@@ -667,7 +789,7 @@ catch (sycl::exception const &exc) {
667
789
  }
668
790
 
669
791
  static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
670
- return 128;
792
+ return SYCL_BUFFER_ALIGNMENT;
671
793
  GGML_UNUSED(buft);
672
794
  }
673
795
 
@@ -893,18 +1015,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
893
1015
  size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
894
1016
  }
895
1017
 
896
- // FIXME: do not crash if SYCL Buffer alloc fails
897
- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
898
1018
  ggml_sycl_set_device(i);
899
1019
  const queue_ptr stream = ctx->streams[i];
900
1020
  char * buf;
901
- /*
902
- DPCT1009:208: SYCL uses exceptions to report errors and does not use the
903
- error codes. The original code was commented out and a warning string
904
- was inserted. You need to rewrite this code.
905
- */
906
- SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
907
- size, *stream)));
1021
+ SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
908
1022
  if (!buf) {
909
1023
  char err_buf[1024];
910
1024
  snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
@@ -1068,6 +1182,8 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
1068
1182
  /* .memset_tensor = */ NULL,
1069
1183
  /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
1070
1184
  /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
1185
+ /* .set_tensor_2d = */ NULL,
1186
+ /* .get_tensor_2d = */ NULL,
1071
1187
  /* .cpy_tensor = */ NULL,
1072
1188
  /* .clear = */ ggml_backend_sycl_split_buffer_clear,
1073
1189
  /* .reset = */ NULL,
@@ -1096,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
1096
1212
  }
1097
1213
 
1098
1214
  static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1099
- return 128;
1215
+ return SYCL_BUFFER_ALIGNMENT;
1100
1216
  GGML_UNUSED(buft);
1101
1217
  }
1102
1218
 
@@ -1260,16 +1376,53 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1260
1376
  explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
1261
1377
 
1262
1378
  ~ggml_sycl_pool_leg() {
1379
+ #ifdef DEBUG_SYCL_POOL
1380
+ int n_cached = 0;
1381
+ size_t bytes_cached = 0;
1382
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1383
+ if (buffer_pool[i].ptr != nullptr) {
1384
+ ++n_cached;
1385
+ bytes_cached += buffer_pool[i].size;
1386
+ }
1387
+ }
1388
+ GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
1389
+ n_cached, bytes_cached / 1024.0 / 1024.0);
1390
+ const auto slots = format_slots_in_alloc_order();
1391
+ if (!slots.empty()) {
1392
+ GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
1393
+ }
1394
+ #endif
1395
+
1263
1396
  for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1264
1397
  ggml_sycl_buffer & b = buffer_pool[i];
1265
1398
  if (b.ptr != nullptr) {
1266
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
1399
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
1267
1400
  pool_size -= b.size;
1268
1401
  }
1269
1402
  }
1270
1403
  GGML_ASSERT(pool_size == 0);
1271
1404
  }
1272
1405
 
1406
+ #ifdef DEBUG_SYCL_POOL
1407
+ std::string format_slots_in_alloc_order() const {
1408
+ std::string line;
1409
+ char buf[32];
1410
+ bool first = true;
1411
+ for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
1412
+ if (buffer_pool[i].ptr == nullptr) {
1413
+ continue;
1414
+ }
1415
+ if (!first) {
1416
+ line += '/';
1417
+ }
1418
+ first = false;
1419
+ snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
1420
+ line += buf;
1421
+ }
1422
+ return line;
1423
+ }
1424
+ #endif
1425
+
1273
1426
  void * alloc(size_t size, size_t * actual_size) override {
1274
1427
  #ifdef DEBUG_sycl_MALLOC
1275
1428
  int nnz = 0;
@@ -1311,9 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1311
1464
  void * ptr;
1312
1465
  size_t look_ahead_size = (size_t) (1.05 * size);
1313
1466
 
1314
- SYCL_CHECK(
1315
- CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
1316
- look_ahead_size, *qptr)));
1467
+ SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
1317
1468
  if (!ptr) {
1318
1469
  GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
1319
1470
  return nullptr;
@@ -1341,11 +1492,126 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
1341
1492
  }
1342
1493
  }
1343
1494
  GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
1344
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
1495
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
1345
1496
  pool_size -= size;
1346
1497
  }
1347
1498
  };
1348
1499
 
1500
+ // pool with virtual memory management
1501
+ #if defined(GGML_SYCL_USE_VMM)
1502
+ struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
1503
+ static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
1504
+
1505
+ int device;
1506
+ sycl::context ctx;
1507
+ sycl::device dev;
1508
+
1509
+ uintptr_t pool_addr = 0;
1510
+ size_t pool_used = 0;
1511
+ size_t pool_size = 0;
1512
+ size_t granularity;
1513
+
1514
+ // physical_mem owns the commits (unlike cuMemMap)
1515
+ struct mapping {
1516
+ sycl::ext::oneapi::experimental::physical_mem phys;
1517
+ void * map_ptr;
1518
+ };
1519
+ std::vector<mapping> mappings;
1520
+
1521
+ explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
1522
+ device(device_),
1523
+ ctx(qptr_->get_context()),
1524
+ dev(qptr_->get_device()),
1525
+ granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
1526
+ }
1527
+
1528
+ ~ggml_sycl_pool_vmm() {
1529
+ if (pool_addr == 0) {
1530
+ return;
1531
+ }
1532
+
1533
+ // Per spec, unmap must (a) match the exact (ptr, size) of an earlier
1534
+ // physical_mem::map() call and (b) precede destruction of the
1535
+ // physical_mem objects (their dtors won't unmap).
1536
+ for (auto & m : mappings) {
1537
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
1538
+ m.map_ptr, m.phys.size(), ctx)));
1539
+ }
1540
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
1541
+ pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
1542
+ }
1543
+
1544
+ void * alloc(size_t size, size_t * actual_size) override {
1545
+ // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
1546
+ size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
1547
+
1548
+ size_t avail = pool_size - pool_used;
1549
+
1550
+ if (size > avail) {
1551
+ // round up to the next multiple of the granularity
1552
+ size_t reserve_size = GGML_PAD(size - avail, granularity);
1553
+
1554
+ GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
1555
+
1556
+ // allocate more physical memory
1557
+ std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
1558
+ SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
1559
+
1560
+ // reserve virtual address space (if not already reserved)
1561
+ if (pool_addr == 0) {
1562
+ SYCL_CHECK(CHECK_TRY_ERROR(
1563
+ pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
1564
+ SYCL_POOL_VMM_MAX_SIZE, ctx)));
1565
+ }
1566
+
1567
+ // map at the end of the pool
1568
+ void * map_ptr = nullptr;
1569
+ SYCL_CHECK(CHECK_TRY_ERROR(
1570
+ map_ptr = phys->map(pool_addr + pool_size, reserve_size,
1571
+ sycl::ext::oneapi::experimental::address_access_mode::read_write)));
1572
+
1573
+ // stash these so we could unmap this exact range in dtor
1574
+ mappings.push_back({
1575
+ std::move(*phys),
1576
+ map_ptr,
1577
+ });
1578
+
1579
+ // add to the pool
1580
+ pool_size += reserve_size;
1581
+
1582
+ #ifdef DEBUG_SYCL_MALLOC
1583
+ GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
1584
+ device, (unsigned long long) (pool_size/1024/1024),
1585
+ (unsigned long long) (reserve_size/1024/1024));
1586
+ #endif
1587
+ }
1588
+
1589
+ GGML_ASSERT(pool_addr != 0);
1590
+
1591
+ void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
1592
+ *actual_size = size;
1593
+ pool_used += size;
1594
+
1595
+ #ifdef DEBUG_SYCL_MALLOC
1596
+ GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
1597
+ #endif
1598
+
1599
+ return ptr;
1600
+ }
1601
+
1602
+ void free(void * ptr, size_t size) override {
1603
+ #ifdef DEBUG_SYCL_MALLOC
1604
+ GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
1605
+ #endif
1606
+
1607
+ pool_used -= size;
1608
+
1609
+ // all deallocations must be in reverse order of the allocations
1610
+ GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
1611
+ }
1612
+ };
1613
+ #endif // defined(GGML_SYCL_USE_VMM)
1614
+
1349
1615
  struct ggml_sycl_pool_host : public ggml_sycl_pool {
1350
1616
  queue_ptr qptr;
1351
1617
  int device;
@@ -1426,15 +1692,18 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
1426
1692
  }
1427
1693
 
1428
1694
  std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
1429
- // TBD: NO VMM support
1430
- // if (ggml_sycl_info().devices[device].vmm) {
1431
- // return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
1432
- // }
1433
- return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
1695
+ #if defined(GGML_SYCL_USE_VMM)
1696
+ if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
1697
+ return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
1698
+ }
1699
+ #endif // defined(GGML_SYCL_USE_VMM)
1700
+ return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
1434
1701
  }
1435
1702
 
1436
- // TBD pool with virtual memory management
1437
- // struct ggml_sycl_pool_vmm : public ggml_sycl_pool
1703
+
1704
+ std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
1705
+ return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
1706
+ }
1438
1707
 
1439
1708
  /// kernels
1440
1709
  typedef void (*ggml_sycl_op_mul_mat_t)(
@@ -2156,6 +2425,31 @@ inline void ggml_sycl_op_mul_mat_sycl(
2156
2425
  #else
2157
2426
  bool use_fp16 = false;
2158
2427
  #endif
2428
+
2429
+ #if GGML_SYCL_DNNL && defined(GGML_SYCL_HAS_BF16)
2430
+ // Fast path for bf16 src0
2431
+ if (src0->type == GGML_TYPE_BF16 && !g_ggml_sycl_disable_dnn && ggml_is_contiguous(src0) &&
2432
+ row_diff == src0->ne[1]) {
2433
+ using bf16_t = sycl::ext::oneapi::bfloat16;
2434
+ ggml_sycl_pool_alloc<bf16_t> src1_as_bf16(ctx.pool(), src1_ncols*ne10);
2435
+ if (src1->type != GGML_TYPE_BF16) {
2436
+ const to_bf16_sycl_t to_bf16_sycl = ggml_get_to_bf16_sycl(src1->type, dst);
2437
+ GGML_ASSERT(to_bf16_sycl != nullptr);
2438
+ to_bf16_sycl(src1_ddf_i, src1_as_bf16.get(), src1_ncols*ne10, stream);
2439
+ } else {
2440
+ stream->memcpy(src1_as_bf16.get(), src1_ddf_i, src1_ncols*ne10*sizeof(bf16_t));
2441
+ }
2442
+ DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10,
2443
+ src0_dd_i, DnnlGemmWrapper::to_dt<bf16_t>(),
2444
+ src1_as_bf16.get(), DnnlGemmWrapper::to_dt<bf16_t>(),
2445
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2446
+ GGML_UNUSED(dst);
2447
+ GGML_UNUSED(src1_ddq_i);
2448
+ GGML_UNUSED(src1_padded_row_size);
2449
+ return;
2450
+ }
2451
+ #endif
2452
+
2159
2453
  if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
2160
2454
  row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
2161
2455
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
@@ -2233,21 +2527,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
2233
2527
  const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
2234
2528
  const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
2235
2529
 
2530
+ {
2531
+ const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
2532
+ const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
2236
2533
  #if GGML_SYCL_DNNL
2237
- if (!g_ggml_sycl_disable_dnn) {
2238
- DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
2239
- DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2240
- dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2241
- }
2242
- else
2534
+ if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
2535
+ DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
2536
+ DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2537
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2538
+ }
2539
+ else
2243
2540
  #endif
2244
- {
2245
- const float alpha = 1.0f;
2246
- const float beta = 0.0f;
2247
- SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
2248
- *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
2249
- src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
2250
- dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
2541
+ {
2542
+ const float alpha = 1.0f;
2543
+ const float beta = 0.0f;
2544
+ SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
2545
+ *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
2546
+ src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
2547
+ dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
2548
+ }
2251
2549
  }
2252
2550
  }
2253
2551
  GGML_UNUSED(dst);
@@ -3249,8 +3547,11 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
3249
3547
  inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
3250
3548
  switch (type) {
3251
3549
  case GGML_TYPE_Q4_0:
3550
+ case GGML_TYPE_Q8_0:
3252
3551
  return true;
3552
+ case GGML_TYPE_Q3_K:
3253
3553
  case GGML_TYPE_Q4_K:
3554
+ case GGML_TYPE_Q5_K:
3254
3555
  case GGML_TYPE_Q6_K:
3255
3556
  return !g_ggml_sycl_prioritize_dmmv;
3256
3557
  default:
@@ -3261,6 +3562,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
3261
3562
  inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
3262
3563
  switch (type) {
3263
3564
  case GGML_TYPE_Q4_0:
3565
+ case GGML_TYPE_Q8_0:
3264
3566
  return true;
3265
3567
  default:
3266
3568
  return false;
@@ -3270,7 +3572,10 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
3270
3572
  inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
3271
3573
  switch (type) {
3272
3574
  case GGML_TYPE_Q4_0:
3575
+ case GGML_TYPE_Q8_0:
3576
+ case GGML_TYPE_Q3_K:
3273
3577
  case GGML_TYPE_Q4_K:
3578
+ case GGML_TYPE_Q5_K:
3274
3579
  case GGML_TYPE_Q6_K:
3275
3580
  return true;
3276
3581
  default:
@@ -3291,6 +3596,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
3291
3596
  case GGML_TYPE_Q5_K:
3292
3597
  case GGML_TYPE_Q6_K:
3293
3598
  case GGML_TYPE_F16:
3599
+ case GGML_TYPE_BF16:
3294
3600
  return true;
3295
3601
  default:
3296
3602
  return false;
@@ -3308,7 +3614,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
3308
3614
  // If async allocation extension is not available, use_async should always be false.
3309
3615
  GGML_ASSERT(!use_async);
3310
3616
  #endif
3311
- return sycl::malloc(size, *stream, sycl::usm::alloc::device);
3617
+ return ggml_sycl_malloc_device(size, *stream);
3312
3618
  }
3313
3619
 
3314
3620
  static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
@@ -3322,12 +3628,58 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
3322
3628
  // If async allocation extension is not available, use_async should always be false.
3323
3629
  GGML_ASSERT(!use_async);
3324
3630
  #endif
3325
- sycl::free(ptr, *stream);
3631
+ ggml_sycl_free_device(ptr, *stream);
3326
3632
  }
3327
3633
 
3328
- static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3634
+ // RAII wrapper for temporary reorder buffers with optional host memory fallback.
3635
+ // When device allocation fails and GGML_SYCL_HOST_MEM_FALLBACK is enabled,
3636
+ // falls back to host memory so the reorder kernel can still run (over PCIe).
3637
+ // Device access to host memory requires Linux kernel 6.8+ (Ubuntu 26.04+).
3638
+ struct sycl_reorder_temp_buffer {
3639
+ void * ptr = nullptr;
3640
+ dpct::queue_ptr stream;
3641
+
3642
+ sycl_reorder_temp_buffer(dpct::queue_ptr stream, size_t size) : stream(stream) {
3643
+ ptr = sycl_ext_malloc_device(stream, size);
3644
+ #ifdef GGML_SYCL_HOST_MEM_FALLBACK
3645
+ if (!ptr) {
3646
+ ptr = sycl::malloc_host(size, *stream);
3647
+ if (ptr) {
3648
+ host_fallback = true;
3649
+ GGML_LOG_WARN("%s: device alloc of %zu bytes failed, using host memory fallback\n", __func__, size);
3650
+ }
3651
+ }
3652
+ #endif
3653
+ }
3654
+
3655
+ ~sycl_reorder_temp_buffer() {
3656
+ if (!ptr) {
3657
+ return;
3658
+ }
3659
+ if (host_fallback) {
3660
+ sycl::free(ptr, *stream);
3661
+ } else {
3662
+ sycl_ext_free(stream, ptr);
3663
+ }
3664
+ }
3665
+
3666
+ explicit operator bool() const { return ptr != nullptr; }
3667
+
3668
+ sycl_reorder_temp_buffer(const sycl_reorder_temp_buffer &) = delete;
3669
+ sycl_reorder_temp_buffer & operator=(const sycl_reorder_temp_buffer &) = delete;
3670
+
3671
+ private:
3672
+ bool host_fallback = false;
3673
+ };
3674
+
3675
+ static bool reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3329
3676
  dpct::queue_ptr stream) {
3330
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3677
+ sycl_reorder_temp_buffer tmp(stream, size);
3678
+ if (!tmp) {
3679
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3680
+ return false;
3681
+ }
3682
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3331
3683
 
3332
3684
  sycl::event copy_event;
3333
3685
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3356,16 +3708,60 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
3356
3708
  if (!g_ggml_sycl_use_async_mem_op) {
3357
3709
  reorder_event.wait_and_throw();
3358
3710
  }
3359
- sycl_ext_free(stream, tmp_buf);
3711
+ return true;
3712
+ }
3713
+
3714
+ static bool reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
3715
+ dpct::queue_ptr stream) {
3716
+ sycl_reorder_temp_buffer tmp(stream, size);
3717
+ if (!tmp) {
3718
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3719
+ return false;
3720
+ }
3721
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3722
+
3723
+ sycl::event copy_event;
3724
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3725
+ if (!g_ggml_sycl_use_async_mem_op) {
3726
+ copy_event.wait();
3727
+ }
3728
+
3729
+ GGML_ASSERT((size % sizeof(block_q8_0) == 0));
3730
+ GGML_ASSERT((offset % sizeof(block_q8_0) == 0));
3731
+ int offset_blks = offset / sizeof(block_q8_0);
3732
+ auto qs_ptr = data_device + offset_blks * QK8_0;
3733
+ auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;
3734
+
3735
+ auto reorder_event = stream->parallel_for(
3736
+ size / sizeof(block_q8_0),
3737
+ [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
3738
+ const block_q8_0* x = (const block_q8_0*)tmp_buf;
3739
+ const int ib = i;
3740
+
3741
+ for (int j = 0; j < QK8_0; j++)
3742
+ {
3743
+ *((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
3744
+ }
3745
+ *(d_ptr + ib) = x[ib].d;
3746
+ });
3747
+ if (!g_ggml_sycl_use_async_mem_op) {
3748
+ reorder_event.wait_and_throw();
3749
+ }
3750
+ return true;
3360
3751
  }
3361
3752
 
3362
- static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3753
+ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3363
3754
  GGML_ASSERT(size % sizeof(block_q4_K) == 0);
3364
3755
  GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
3365
3756
 
3366
3757
  const int nblocks = size / sizeof(block_q4_K);
3367
3758
 
3368
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3759
+ sycl_reorder_temp_buffer tmp(stream, size);
3760
+ if (!tmp) {
3761
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3762
+ return false;
3763
+ }
3764
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3369
3765
 
3370
3766
  sycl::event copy_event;
3371
3767
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3394,16 +3790,117 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
3394
3790
  if (!g_ggml_sycl_use_async_mem_op) {
3395
3791
  reorder_event.wait_and_throw();
3396
3792
  }
3397
- sycl_ext_free(stream, tmp_buf);
3793
+ return true;
3794
+ }
3795
+
3796
+ static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3797
+ GGML_ASSERT(size % sizeof(block_q3_K) == 0);
3798
+ GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
3799
+
3800
+ const int nblocks = size / sizeof(block_q3_K);
3801
+
3802
+ sycl_reorder_temp_buffer tmp(stream, size);
3803
+ if (!tmp) {
3804
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3805
+ return false;
3806
+ }
3807
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3808
+
3809
+ sycl::event copy_event;
3810
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3811
+ if (!g_ggml_sycl_use_async_mem_op) {
3812
+ copy_event.wait();
3813
+ }
3814
+
3815
+ auto * qs_ptr = data_device;
3816
+ auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
3817
+ auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
3818
+ sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);
3819
+
3820
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
3821
+ const block_q3_K * x = (const block_q3_K *) tmp_buf;
3822
+ const int ib = i;
3823
+
3824
+ for (int j = 0; j < QK_K / 4; ++j) {
3825
+ qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
3826
+ }
3827
+
3828
+ for (int j = 0; j < QK_K / 8; ++j) {
3829
+ hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
3830
+ }
3831
+
3832
+ for (int j = 0; j < 12; ++j) {
3833
+ scales_ptr[ib * 12 + j] = x[ib].scales[j];
3834
+ }
3835
+
3836
+ d_ptr[ib] = x[ib].d;
3837
+ });
3838
+ if (!g_ggml_sycl_use_async_mem_op) {
3839
+ reorder_event.wait_and_throw();
3840
+ }
3841
+ return true;
3398
3842
  }
3399
3843
 
3400
- static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3844
+ static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3845
+ GGML_ASSERT(size % sizeof(block_q5_K) == 0);
3846
+ GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
3847
+
3848
+ const int nblocks = size / sizeof(block_q5_K);
3849
+
3850
+ sycl_reorder_temp_buffer tmp(stream, size);
3851
+ if (!tmp) {
3852
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3853
+ return false;
3854
+ }
3855
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3856
+
3857
+ sycl::event copy_event;
3858
+ SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3859
+ if (!g_ggml_sycl_use_async_mem_op) {
3860
+ copy_event.wait();
3861
+ }
3862
+
3863
+ auto * qs_ptr = data_device;
3864
+ auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
3865
+ auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
3866
+ auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
3867
+
3868
+ auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
3869
+ const block_q5_K * x = (const block_q5_K *) tmp_buf;
3870
+ const int ib = i;
3871
+
3872
+ for (int j = 0; j < QK_K / 2; ++j) {
3873
+ qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
3874
+ }
3875
+
3876
+ for (int j = 0; j < QK_K / 8; ++j) {
3877
+ qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
3878
+ }
3879
+
3880
+ for (int j = 0; j < K_SCALE_SIZE; ++j) {
3881
+ scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
3882
+ }
3883
+
3884
+ dm_ptr[ib] = x[ib].dm;
3885
+ });
3886
+ if (!g_ggml_sycl_use_async_mem_op) {
3887
+ reorder_event.wait_and_throw();
3888
+ }
3889
+ return true;
3890
+ }
3891
+
3892
+ static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3401
3893
  GGML_ASSERT(size % sizeof(block_q6_K) == 0);
3402
3894
  GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
3403
3895
 
3404
3896
  const int nblocks = size / sizeof(block_q6_K);
3405
3897
 
3406
- uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
3898
+ sycl_reorder_temp_buffer tmp(stream, size);
3899
+ if (!tmp) {
3900
+ GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3901
+ return false;
3902
+ }
3903
+ uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3407
3904
 
3408
3905
  sycl::event copy_event;
3409
3906
  SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
@@ -3442,10 +3939,10 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
3442
3939
  if (!g_ggml_sycl_use_async_mem_op) {
3443
3940
  reorder_event.wait_and_throw();
3444
3941
  }
3445
- sycl_ext_free(stream, tmp_buf);
3942
+ return true;
3446
3943
  }
3447
3944
 
3448
- static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3945
+ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3449
3946
  uint8_t * data_device = (uint8_t *) src0->data;
3450
3947
  size_t ncols = src0->ne[0];
3451
3948
  size_t nrows = src0->ne[1];
@@ -3453,17 +3950,20 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3453
3950
 
3454
3951
  switch (src0->type) {
3455
3952
  case GGML_TYPE_Q4_0:
3456
- reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
3457
- break;
3953
+ return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
3954
+ case GGML_TYPE_Q8_0:
3955
+ return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
3956
+ case GGML_TYPE_Q3_K:
3957
+ return reorder_qw_q3_k(data_device, size, 0, stream);
3458
3958
  case GGML_TYPE_Q4_K:
3459
- reorder_qw_q4_k(data_device, size, 0, stream);
3460
- break;
3959
+ return reorder_qw_q4_k(data_device, size, 0, stream);
3960
+ case GGML_TYPE_Q5_K:
3961
+ return reorder_qw_q5_k(data_device, size, 0, stream);
3461
3962
  case GGML_TYPE_Q6_K:
3462
- reorder_qw_q6_k(data_device, size, 0, stream);
3463
- break;
3963
+ return reorder_qw_q6_k(data_device, size, 0, stream);
3464
3964
  default:
3465
3965
  GGML_ABORT("reorder_qw() called with unsupported type");
3466
- break;
3966
+ return false;
3467
3967
  }
3468
3968
  }
3469
3969
 
@@ -3471,7 +3971,9 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
3471
3971
  return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
3472
3972
  ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
3473
3973
  dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
3474
- dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
3974
+ // ne[1] <= 8 so multi-column decode (spec / MTP verify) also bootstraps the reorder;
3975
+ // all reorderable types have a _switch_ncols kernel.
3976
+ dst->src[1]->ne[1] <= 8 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
3475
3977
  }
3476
3978
 
3477
3979
  static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -3503,14 +4005,20 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
3503
4005
  break;
3504
4006
  }
3505
4007
 
3506
- reorder_qw(src0, ctx->stream());
3507
- extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
4008
+ if (reorder_qw(src0, ctx->stream())) {
4009
+ extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
4010
+ }
3508
4011
  }
3509
4012
 
3510
4013
 
3511
4014
  static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4015
+ // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
4016
+ // a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
4017
+ // need ne[0] % DMMV_X == 0.
4018
+ const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
4019
+ 2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
3512
4020
  return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
3513
- src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
4021
+ src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
3514
4022
  }
3515
4023
 
3516
4024
  static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3560,9 +4068,16 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
3560
4068
  // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
3561
4069
  // is enabled takes precedence over DMMV, the current if-else implementation
3562
4070
  // requires disabling DMMV if both conditions are met
4071
+
3563
4072
  if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
3564
4073
  ggml_sycl_supports_reorder_mmvq(src0->type)))) {
3565
- use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
4074
+ // Arc770 get benefit with Q4_0 by skipping it.
4075
+ if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
4076
+ gpu_arch::intel_gpu_acm_g10 &&
4077
+ src0->type == GGML_TYPE_Q4_0)) {
4078
+ use_dequantize_mul_mat_vec =
4079
+ use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
4080
+ }
3566
4081
  }
3567
4082
 
3568
4083
  if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
@@ -3607,35 +4122,17 @@ struct mmid_row_mapping {
3607
4122
 
3608
4123
  __dpct_inline__ static void k_copy_src1_to_contiguous(
3609
4124
  const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
3610
- int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
3611
- const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
4125
+ const mmid_row_mapping *__restrict__ row_mapping,
3612
4126
  int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
3613
- const sycl::nd_item<3> &item_ct1, int &src1_row) {
3614
- int32_t iid1 = item_ct1.get_group(2);
3615
- int32_t id = item_ct1.get_group(1);
4127
+ const sycl::nd_item<3> &item_ct1) {
4128
+ const int32_t src1_row = item_ct1.get_group(2);
3616
4129
 
3617
- const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
3618
-
3619
- if (row_id_i != i02) {
3620
- return;
3621
- }
4130
+ const int32_t iid1 = row_mapping[src1_row].i2;
4131
+ const int32_t id = row_mapping[src1_row].i1;
3622
4132
 
3623
4133
  const int64_t i11 = id % ne11;
3624
4134
  const int64_t i12 = iid1;
3625
4135
 
3626
- if (item_ct1.get_local_id(2) == 0) {
3627
- src1_row =
3628
- dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
3629
- cur_src1_row, 1);
3630
- row_mapping[src1_row] = {id, iid1};
3631
- }
3632
- /*
3633
- DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
3634
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
3635
- performance if there is no access to global memory.
3636
- */
3637
- item_ct1.barrier();
3638
-
3639
4136
  const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
3640
4137
  float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
3641
4138
 
@@ -3665,6 +4162,92 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
3665
4162
  }
3666
4163
  }
3667
4164
 
4165
+ // Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
4166
+ static bool ggml_sycl_mul_mat_id_mmvq_fused(
4167
+ ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
4168
+ const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
4169
+ {
4170
+ const int64_t ne10 = src1->ne[0];
4171
+ const int64_t ne11 = src1->ne[1];
4172
+ const int64_t ne12 = src1->ne[2];
4173
+ if (ne12 != 1) return false;
4174
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
4175
+ if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
4176
+ if (!ggml_is_contiguous(src1)) return false;
4177
+
4178
+ // Reorder layout not supported; fall back.
4179
+ const ggml_tensor_extra_gpu * src0_extra =
4180
+ static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
4181
+ if (src0_extra && src0_extra->optimized_feature.reorder) return false;
4182
+
4183
+ const int64_t n_ids_per_group = ids->ne[0];
4184
+ if (ids->ne[1] != 1) return false;
4185
+ if (ne11 != 1 && ne11 != n_ids_per_group) return false;
4186
+
4187
+ const queue_ptr stream = ctx.stream();
4188
+ const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
4189
+ const int n_experts_used = (int) n_ids_per_group;
4190
+ const int nrows = (int) src0->ne[1];
4191
+
4192
+ ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
4193
+ (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
4194
+ char * src1_ddq = src1_q8_alloc.get();
4195
+ quantize_row_q8_1_sycl<quantize_q8_1>(
4196
+ (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
4197
+ src1_padded_cols, stream);
4198
+
4199
+ const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
4200
+ const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
4201
+
4202
+ return ggml_sycl_mul_mat_vec_q_id(
4203
+ src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
4204
+ (float *) dst->data, (int) ne10, nrows, n_experts_used,
4205
+ /*expert_weight_stride=*/ src0->nb[2],
4206
+ /*dst_row_stride=*/ dst->nb[1],
4207
+ src1_row_stride, stream);
4208
+ }
4209
+
4210
+ // counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
4211
+ // builds a projection of a memory layout where each expert's slice is contiguous
4212
+ static void mmid_counting_sort_rows(
4213
+ const ggml_tensor * ids, const char * ids_host,
4214
+ int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
4215
+ std::vector<int64_t> & expert_counts,
4216
+ std::vector<int64_t> & expert_row_offsets,
4217
+ std::vector<mmid_row_mapping> & routed_row_src) {
4218
+
4219
+ // frequencies: how many routed rows each expert "owns"
4220
+ expert_counts.assign(n_as, 0);
4221
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
4222
+ for (int64_t id = 0; id < n_ids; id++) {
4223
+ const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
4224
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4225
+ expert_counts[row_id_i]++;
4226
+ }
4227
+ }
4228
+
4229
+ // where each expert's slice starts (row indices) and the previous ends
4230
+ expert_row_offsets.assign(n_as + 1, 0);
4231
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
4232
+ expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
4233
+ }
4234
+
4235
+ std::vector<int64_t> expert_row_next = expert_row_offsets;
4236
+ routed_row_src.resize(n_routed_rows);
4237
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
4238
+ for (int64_t id = 0; id < n_ids; id++) {
4239
+ const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
4240
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4241
+
4242
+ // find and validate the next free row for a given expert (row_id_i)
4243
+ const int64_t routed_row = expert_row_next[row_id_i]++;
4244
+ GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
4245
+ GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
4246
+ routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
4247
+ }
4248
+ }
4249
+ }
4250
+
3668
4251
  static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3669
4252
  ggml_tensor *dst) try {
3670
4253
  scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3680,6 +4263,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3680
4263
  const int64_t n_as = ne02;
3681
4264
  const int64_t n_ids = ids->ne[0];
3682
4265
 
4266
+ if (ne12 == 1) {
4267
+ if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
4268
+ return;
4269
+ }
4270
+ }
4271
+
3683
4272
  std::vector<char> ids_host(ggml_nbytes(ids));
3684
4273
  const char * ids_dev = (const char *) ids->data;
3685
4274
 
@@ -3730,105 +4319,98 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3730
4319
  }
3731
4320
  }
3732
4321
  } else {
3733
- ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
3734
- ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
4322
+ const int64_t n_routed_rows = ids->ne[1] * n_ids;
4323
+ ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne10);
4324
+ ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*n_routed_rows*ne0);
3735
4325
 
3736
4326
  src1_row.data = src1_contiguous.get();
3737
4327
  dst_row.data = dst_contiguous.get();
3738
4328
 
3739
- for (int64_t i02 = 0; i02 < n_as; i02++) {
3740
- int64_t num_src1_rows = 0;
3741
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
3742
- for (int64_t id = 0; id < n_ids; id++) {
3743
- const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
4329
+ // how many "owned" routed rows to pass to each expert
4330
+ std::vector<int64_t> expert_row_counts;
4331
+ // where each expert's slice starts and the previous ends (row indices, right-exclusive)
4332
+ std::vector<int64_t> expert_row_offsets;
4333
+ // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
4334
+ std::vector<mmid_row_mapping> routed_row_src;
3744
4335
 
3745
- GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
4336
+ mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
4337
+ expert_row_counts, expert_row_offsets, routed_row_src);
3746
4338
 
3747
- if (row_id_i != i02) {
3748
- continue;
3749
- }
4339
+ ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
4340
+ SYCL_CHECK(CHECK_TRY_ERROR(
4341
+ stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
3750
4342
 
3751
- num_src1_rows++;
3752
- }
3753
- }
4343
+ const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
4344
+ assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
4345
+
4346
+ {
4347
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
4348
+ sycl::range<3> grid_dims(1, 1, n_routed_rows);
4349
+ stream->submit([&](sycl::handler &cgh) {
4350
+ char *__restrict src1_contiguous_get =
4351
+ src1_contiguous.get();
4352
+ mmid_row_mapping *__restrict dev_row_mapping_get =
4353
+ dev_row_mapping.get();
4354
+
4355
+ cgh.parallel_for(
4356
+ sycl::nd_range<3>(grid_dims * block_dims, block_dims),
4357
+ [=](sycl::nd_item<3> item_ct1) {
4358
+ k_copy_src1_to_contiguous(
4359
+ src1_original, src1_contiguous_get,
4360
+ dev_row_mapping_get,
4361
+ ne11, ne10, nb11, nb12,
4362
+ item_ct1);
4363
+ });
4364
+ });
4365
+ }
4366
+
4367
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
4368
+ const int64_t num_src1_rows = expert_row_counts[i02];
3754
4369
 
3755
4370
  if (num_src1_rows == 0) {
3756
4371
  continue;
3757
4372
  }
3758
4373
 
3759
-
3760
- ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
3761
- ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
3762
- SYCL_CHECK(CHECK_TRY_ERROR(
3763
- stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
3764
-
3765
- const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
3766
- assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
3767
-
3768
- {
3769
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
3770
- sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3771
- stream->submit([&](sycl::handler &cgh) {
3772
- sycl::local_accessor<int, 0> src1_row_acc(cgh);
3773
-
3774
- char *__restrict src1_contiguous_get =
3775
- src1_contiguous.get();
3776
- int *__restrict dev_cur_src1_row_get =
3777
- dev_cur_src1_row.get();
3778
- mmid_row_mapping *__restrict dev_row_mapping_get =
3779
- dev_row_mapping.get();
3780
- size_t ids_nb_ct6 = ids->nb[1];
3781
- size_t ids_nb_ct7 = ids->nb[0];
3782
-
3783
- cgh.parallel_for(
3784
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3785
- [=](sycl::nd_item<3> item_ct1) {
3786
- k_copy_src1_to_contiguous(
3787
- src1_original, src1_contiguous_get,
3788
- dev_cur_src1_row_get,
3789
- dev_row_mapping_get, ids_dev, i02,
3790
- ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
3791
- item_ct1, src1_row_acc);
3792
- });
3793
- });
3794
- }
4374
+ const int64_t expert_row_offset = expert_row_offsets[i02];
3795
4375
 
3796
4376
  src0_row.data = src0_original + i02*nb02;
3797
4377
 
3798
4378
  GGML_ASSERT(nb11 == sizeof(float)*ne10);
3799
4379
  GGML_ASSERT(nb1 == sizeof(float)*ne0);
4380
+ src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
3800
4381
  src1_row.ne[1] = num_src1_rows;
3801
4382
 
3802
4383
  src1_row.nb[1] = nb11;
3803
4384
  src1_row.nb[2] = num_src1_rows*nb11;
3804
4385
  src1_row.nb[3] = num_src1_rows*nb11;
3805
4386
 
4387
+ dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
3806
4388
  dst_row.ne[1] = num_src1_rows;
3807
4389
  dst_row.nb[1] = nb1;
3808
4390
  dst_row.nb[2] = num_src1_rows*nb1;
3809
4391
  dst_row.nb[3] = num_src1_rows*nb1;
3810
4392
 
3811
4393
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
4394
+ }
3812
4395
 
3813
- {
3814
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
3815
- sycl::range<3> grid_dims(1, 1, num_src1_rows);
3816
- stream->submit([&](sycl::handler &cgh) {
3817
- const char *__restrict dst_contiguous_get =
3818
- dst_contiguous.get();
3819
- const mmid_row_mapping *__restrict dev_row_mapping_get =
3820
- dev_row_mapping.get();
3821
-
3822
- cgh.parallel_for(
3823
- sycl::nd_range<3>(grid_dims * block_dims, block_dims),
3824
- [=](sycl::nd_item<3> item_ct1) {
3825
- k_copy_dst_from_contiguous(dst_original,
3826
- dst_contiguous_get,
3827
- dev_row_mapping_get,
3828
- ne0, nb1, nb2, item_ct1);
3829
- });
3830
- });
3831
- }
4396
+ {
4397
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
4398
+ sycl::range<3> grid_dims(1, 1, n_routed_rows);
4399
+ stream->submit([&](sycl::handler &cgh) {
4400
+ const char *__restrict dst_contiguous_get =
4401
+ dst_contiguous.get();
4402
+ const mmid_row_mapping *__restrict dev_row_mapping_get =
4403
+ dev_row_mapping.get();
4404
+
4405
+ cgh.parallel_for(
4406
+ sycl::nd_range<3>(grid_dims * block_dims, block_dims),
4407
+ [=](sycl::nd_item<3> item_ct1) {
4408
+ k_copy_dst_from_contiguous(dst_original,
4409
+ dst_contiguous_get,
4410
+ dev_row_mapping_get,
4411
+ ne0, nb1, nb2, item_ct1);
4412
+ });
4413
+ });
3832
4414
  }
3833
4415
  }
3834
4416
  }
@@ -3858,6 +4440,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
3858
4440
  ggml_sycl_op_im2col(ctx, dst);
3859
4441
  }
3860
4442
 
4443
+ static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
4444
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
4445
+ ggml_sycl_op_im2col_3d(ctx, dst);
4446
+ }
4447
+
3861
4448
  static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
3862
4449
  scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
3863
4450
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
@@ -4155,6 +4742,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
4155
4742
  case GGML_OP_IM2COL:
4156
4743
  ggml_sycl_im2col(ctx, dst);
4157
4744
  break;
4745
+ case GGML_OP_IM2COL_3D:
4746
+ ggml_sycl_im2col_3d(ctx, dst);
4747
+ break;
4158
4748
  case GGML_OP_POOL_2D:
4159
4749
  ggml_sycl_pool2d(ctx, dst);
4160
4750
  break;
@@ -4191,6 +4781,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
4191
4781
  case GGML_OP_SSM_CONV:
4192
4782
  ggml_sycl_ssm_conv(ctx, dst);
4193
4783
  break;
4784
+ case GGML_OP_SSM_SCAN:
4785
+ ggml_sycl_ssm_scan(ctx, dst);
4786
+ break;
4787
+ case GGML_OP_FILL:
4788
+ ggml_sycl_fill(ctx, dst);
4789
+ break;
4790
+ case GGML_OP_CUMSUM:
4791
+ ggml_sycl_cumsum(ctx, dst);
4792
+ break;
4793
+ case GGML_OP_DIAG:
4794
+ ggml_sycl_diag(ctx, dst);
4795
+ break;
4796
+ case GGML_OP_SOLVE_TRI:
4797
+ ggml_sycl_solve_tri(ctx, dst);
4798
+ break;
4194
4799
  case GGML_OP_ROLL:
4195
4800
  ggml_sycl_roll(ctx, dst);
4196
4801
  break;
@@ -4497,6 +5102,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
4497
5102
  /* .free = */ ggml_backend_sycl_free,
4498
5103
  /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
4499
5104
  /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
5105
+ /* .set_tensor_2d_async = */ NULL,
5106
+ /* .get_tensor_2d_async = */ NULL,
4500
5107
  /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
4501
5108
  // // TODO: update for the new
4502
5109
  // interface
@@ -4665,26 +5272,19 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4665
5272
  struct ggml_tensor * a = op->src[0];
4666
5273
  struct ggml_tensor * b = op->src[1];
4667
5274
 
4668
- if (a->ne[3] != b->ne[3]) {
5275
+ // disable Q1_0 until implementation
5276
+ if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
4669
5277
  return false;
4670
5278
  }
4671
- ggml_type a_type = a->type;
4672
- if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
4673
- a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
4674
- a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
4675
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
4676
- ) {
4677
- if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
4678
- return false;
4679
- }
4680
- }
4681
- ggml_type src0_type = op->src[0]->type;
4682
- if (src0_type == GGML_TYPE_BF16 ) {
4683
- // TODO: support GGML_TYPE_BF16
4684
- // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
5279
+
5280
+ if (a->ne[3] != b->ne[3]) {
4685
5281
  return false;
4686
5282
  }
4687
5283
 
5284
+ ggml_type src0_type = op->src[0]->type;
5285
+
5286
+
5287
+
4688
5288
  // TODO: The configuration below needs more work to be supported with oneDNN
4689
5289
  if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
4690
5290
  a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
@@ -4703,12 +5303,31 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4703
5303
  case GGML_OP_GET_ROWS:
4704
5304
  {
4705
5305
  switch (op->src[0]->type) {
5306
+ case GGML_TYPE_I32:
4706
5307
  case GGML_TYPE_F16:
5308
+ case GGML_TYPE_BF16:
4707
5309
  case GGML_TYPE_F32:
5310
+ case GGML_TYPE_Q1_0:
5311
+ case GGML_TYPE_MXFP4:
5312
+ case GGML_TYPE_NVFP4:
5313
+ case GGML_TYPE_IQ2_XXS:
5314
+ case GGML_TYPE_IQ2_XS:
5315
+ case GGML_TYPE_IQ2_S:
5316
+ case GGML_TYPE_IQ3_XXS:
5317
+ case GGML_TYPE_IQ1_S:
5318
+ case GGML_TYPE_IQ1_M:
5319
+ case GGML_TYPE_IQ3_S:
5320
+ case GGML_TYPE_IQ4_NL:
5321
+ case GGML_TYPE_IQ4_XS:
5322
+ case GGML_TYPE_Q2_K:
5323
+ case GGML_TYPE_Q3_K:
4708
5324
  case GGML_TYPE_Q4_0:
4709
5325
  case GGML_TYPE_Q4_1:
5326
+ case GGML_TYPE_Q4_K:
4710
5327
  case GGML_TYPE_Q5_0:
4711
5328
  case GGML_TYPE_Q5_1:
5329
+ case GGML_TYPE_Q5_K:
5330
+ case GGML_TYPE_Q6_K:
4712
5331
  case GGML_TYPE_Q8_0:
4713
5332
  return true;
4714
5333
  default:
@@ -4863,9 +5482,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4863
5482
  case GGML_OP_ROPE:
4864
5483
  case GGML_OP_ROPE_BACK:
4865
5484
  case GGML_OP_IM2COL:
4866
- return true;
5485
+ case GGML_OP_IM2COL_3D:
4867
5486
  case GGML_OP_UPSCALE:
4868
- return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
5487
+ return true;
4869
5488
  case GGML_OP_SUM:
4870
5489
  case GGML_OP_SUM_ROWS:
4871
5490
  case GGML_OP_MEAN:
@@ -4887,11 +5506,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4887
5506
  case GGML_OP_ACC:
4888
5507
  return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
4889
5508
  case GGML_OP_PAD:
4890
- // TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
4891
5509
  if (ggml_get_op_params_i32(op, 8) != 0) {
4892
5510
  return false;
4893
5511
  }
4894
- return ggml_is_contiguous(op->src[0]);
5512
+ return true;
4895
5513
  case GGML_OP_LEAKY_RELU:
4896
5514
  case GGML_OP_TIMESTEP_EMBEDDING:
4897
5515
  case GGML_OP_RWKV_WKV6:
@@ -4907,6 +5525,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4907
5525
  return op->type == GGML_TYPE_F32;
4908
5526
  case GGML_OP_ARANGE:
4909
5527
  return op->type == GGML_TYPE_F32;
5528
+ case GGML_OP_SSM_SCAN:
5529
+ if (op->src[3]->ne[0] == 1) {
5530
+ // Mamba2
5531
+ // (kernel only supports (d_state == 128 || d_state == 256) && d_head % WARP_SIZE == 0)
5532
+ return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % WARP_SIZE == 0;
5533
+ } else {
5534
+ // TODO Mamba-1 not yet ported to SYCL
5535
+ return false;
5536
+ }
5537
+ case GGML_OP_FILL:
5538
+ case GGML_OP_CUMSUM:
5539
+ case GGML_OP_DIAG:
5540
+ return true;
5541
+ case GGML_OP_SOLVE_TRI:
5542
+ return op->src[0]->ne[0] <= SYCL_SOLVE_TRI_MAX_N && op->src[1]->ne[0] <= SYCL_SOLVE_TRI_MAX_K;
4910
5543
  case GGML_OP_FLASH_ATTN_EXT:
4911
5544
  return ggml_sycl_flash_attn_ext_supported(device, op);
4912
5545
  default: