whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as published in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -41,8 +41,10 @@
41
41
  #include "ggml-sycl/element_wise.hpp"
42
42
  #include "ggml-sycl/presets.hpp"
43
43
  #include "ggml-sycl/gemm.hpp"
44
+ #include "ggml-sycl/set_rows.hpp"
44
45
  #include "ggml-sycl/sycl_hw.hpp"
45
46
  #include "ggml-sycl/getrows.hpp"
47
+ #include "ggml-sycl/quantize.hpp"
46
48
  #include "ggml.h"
47
49
 
48
50
  static bool g_sycl_loaded = false;
@@ -83,9 +85,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
83
85
 
84
86
  info.devices[i].cc =
85
87
  100 * prop.get_major_version() + 10 * prop.get_minor_version();
86
- info.devices[i].hw_info = get_device_hw_info(&device);
87
- info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch);
88
-
88
+ info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
89
89
  info.max_work_group_sizes[i] = prop.get_max_work_group_size();
90
90
  }
91
91
 
@@ -195,7 +195,7 @@ static void ggml_check_sycl() try {
195
195
 
196
196
  if (!initialized) {
197
197
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
198
- g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
198
+ g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
199
199
  g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
200
200
  g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
201
201
  g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
@@ -347,14 +347,15 @@ static enum ggml_status
347
347
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
348
348
  ggml_tensor *tensor) try {
349
349
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
350
- debug_print_tensor(": tensor=", tensor, "\n");
350
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
351
351
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
352
352
 
353
353
  if (tensor->view_src != NULL) {
354
354
  assert(tensor->view_src->buffer->buft == buffer->buft);
355
355
  return GGML_STATUS_SUCCESS;
356
356
  }
357
- if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
357
+ if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
358
+ !g_ggml_sycl_disable_optimize) {
358
359
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
359
360
  tensor->extra = extra;
360
361
  ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -384,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
384
385
  const void *data, size_t offset,
385
386
  size_t size) try {
386
387
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
387
- debug_print_tensor(": tensor=", tensor);
388
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
388
389
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
389
390
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
390
391
  ggml_sycl_set_device(ctx->device);
@@ -412,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
412
413
  void *data, size_t offset,
413
414
  size_t size) try {
414
415
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
415
- debug_print_tensor(": tensor=", tensor);
416
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
416
417
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
417
418
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
418
419
 
@@ -443,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
443
444
  ggml_tensor *dst) try {
444
445
  bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
445
446
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
446
- debug_print_tensor(": dst=", dst);
447
- debug_print_tensor(" src=", src);
447
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
448
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
448
449
  GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
449
450
  if (is_cpy_supported) {
450
451
  ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
@@ -524,7 +525,7 @@ catch (sycl::exception const &exc) {
524
525
  static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
525
526
  size_t offset, size_t size) {
526
527
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
527
- debug_print_tensor(": tensor=", tensor);
528
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
528
529
  GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
529
530
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
530
531
  SYCL_CHECK(ggml_sycl_set_device(ctx->device));
@@ -804,7 +805,7 @@ static enum ggml_status
804
805
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
805
806
  ggml_tensor *tensor) try {
806
807
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
807
- debug_print_tensor(": tensor=", tensor, "\n");
808
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
808
809
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
809
810
 
810
811
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -890,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
890
891
  ggml_tensor *tensor, const void *data,
891
892
  size_t offset, size_t size) try {
892
893
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
893
- debug_print_tensor(": tensor=", tensor);
894
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
894
895
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
895
896
  // split tensors must always be set in their entirety at once
896
897
  GGML_ASSERT(offset == 0);
@@ -946,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
946
947
  const ggml_tensor *tensor, void *data,
947
948
  size_t offset, size_t size) try {
948
949
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
949
- debug_print_tensor(": tensor=", tensor);
950
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
950
951
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
951
952
  // split tensors must always be set in their entirety at once
952
953
  GGML_ASSERT(offset == 0);
@@ -1373,67 +1374,6 @@ typedef void (*ggml_sycl_op_mul_mat_t)(
1373
1374
 
1374
1375
 
1375
1376
 
1376
- template<int QUANT_BLOCK_TILE>
1377
- static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
1378
- const sycl::nd_item<3> &item_ct1) {
1379
- const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
1380
- item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
1381
-
1382
- if (ix >= kx_padded) {
1383
- return;
1384
- }
1385
-
1386
- const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
1387
- item_ct1.get_local_id(1);
1388
-
1389
- const int i_padded = iy*kx_padded + ix;
1390
-
1391
- block_q8_1 * y = (block_q8_1 *) vy;
1392
-
1393
- const int ib = i_padded / QK8_1; // block index
1394
- const int iqs = i_padded % QK8_1; // quant index
1395
- typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
1396
- typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
1397
- TC zeros;
1398
- TQ qzeros;
1399
- #pragma unroll
1400
- for (int i = 0; i < QUANT_BLOCK_TILE; i++)
1401
- {
1402
- zeros[i] = 0.f;
1403
- qzeros[i] = 0;
1404
- }
1405
- const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
1406
- float sum = xi[0];
1407
- float amax = sycl::fabs(xi[0]);
1408
- #pragma unroll
1409
- for (int i = 1; i < QUANT_BLOCK_TILE; i++)
1410
- {
1411
- sum += xi[i];
1412
- amax = sycl::fmax(sycl::fabs(xi[i]), amax);
1413
- }
1414
- sum = warp_reduce_sum(sum, item_ct1);
1415
- amax = warp_reduce_max(amax, item_ct1);
1416
-
1417
- const float d = amax / 127;
1418
- TQ q = qzeros;
1419
- if (amax != 0.0f)
1420
- {
1421
- #pragma unroll
1422
- for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
1423
- q[i] = sycl::round(xi[i] / d);
1424
- }
1425
- }
1426
-
1427
- *(TQ *)&y[ib].qs[iqs] = q;
1428
-
1429
- if (iqs > 0) {
1430
- return;
1431
- }
1432
-
1433
- reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
1434
- reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
1435
- }
1436
-
1437
1377
  static void mul_mat_p021_f16_f32(
1438
1378
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1439
1379
  const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1493,7 +1433,7 @@ static void mul_mat_p021_f16_f32(
1493
1433
 
1494
1434
  static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1495
1435
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1496
- const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
1436
+ const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor,
1497
1437
  const sycl::nd_item<3> &item_ct1) {
1498
1438
 
1499
1439
  const sycl::half *x = (const sycl::half *)vx;
@@ -1504,7 +1444,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1504
1444
  item_ct1.get_local_id(0);
1505
1445
  const int channel_x = channel / channel_x_divisor;
1506
1446
 
1507
- const int nrows_y = ncols_x;
1508
1447
  const int nrows_dst = nrows_x;
1509
1448
  const int row_dst = row_x;
1510
1449
 
@@ -1523,7 +1462,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1523
1462
  const int row_y = col_x;
1524
1463
 
1525
1464
  const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
1526
- const int iy = channel*nrows_y + row_y;
1465
+ const int iy = channel * channel_stride_y + row_y;
1527
1466
 
1528
1467
  const float xi =
1529
1468
  sycl::vec<sycl::half, 1>(x[ix])
@@ -1643,7 +1582,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
1643
1582
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
1644
1583
  }
1645
1584
 
1646
- static void scale_f32(const float * x, float * dst, const float scale, const int k,
1585
+ static void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k,
1647
1586
  const sycl::nd_item<3> &item_ct1) {
1648
1587
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
1649
1588
  item_ct1.get_local_id(2);
@@ -1652,7 +1591,7 @@ static void scale_f32(const float * x, float * dst, const float scale, const int
1652
1591
  return;
1653
1592
  }
1654
1593
 
1655
- dst[i] = scale * x[i];
1594
+ dst[i] = scale * x[i] + bias;
1656
1595
  }
1657
1596
 
1658
1597
 
@@ -1718,25 +1657,6 @@ static void pool2d_nchw_kernel(
1718
1657
  o_ptr[cur_oh * ow + cur_ow] = res;
1719
1658
  }
1720
1659
 
1721
- static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1722
- const int ky, const int kx_padded,
1723
- queue_ptr stream) {
1724
- const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
1725
- const sycl::range<3> num_blocks(1, ky, block_num_x);
1726
- int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
1727
- static_assert(QK8_1 % WARP_SIZE == 0);
1728
- const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
1729
- {
1730
- dpct::has_capability_or_fail(stream->get_device(),
1731
- {sycl::aspect::fp16});
1732
-
1733
- stream->parallel_for(
1734
- sycl::nd_range<3>(num_blocks * block_size, block_size),
1735
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1736
- quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1737
- });
1738
- }
1739
- }
1740
1660
 
1741
1661
  static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
1742
1662
  float *dst, const int ncols_x,
@@ -1763,7 +1683,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
1763
1683
  static void ggml_mul_mat_vec_nc_f16_f32_sycl(
1764
1684
  const void *vx, const float *y, float *dst, const int ncols_x,
1765
1685
  const int nrows_x, const int row_stride_x, const int nchannels_x,
1766
- const int nchannels_y, const int channel_stride_x, queue_ptr stream) {
1686
+ const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) {
1767
1687
 
1768
1688
  const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
1769
1689
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -1775,7 +1695,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
1775
1695
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
1776
1696
  [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1777
1697
  mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
1778
- row_stride_x, channel_stride_x,
1698
+ row_stride_x, channel_stride_x, channel_stride_y,
1779
1699
  nchannels_y / nchannels_x, item_ct1);
1780
1700
  });
1781
1701
  }
@@ -1783,7 +1703,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
1783
1703
 
1784
1704
 
1785
1705
 
1786
- static void scale_f32_sycl(const float *x, float *dst, const float scale,
1706
+ static void scale_f32_sycl(const float *x, float *dst, const float scale, const float bias,
1787
1707
  const int k, queue_ptr stream) {
1788
1708
  const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
1789
1709
  stream->parallel_for(
@@ -1791,7 +1711,7 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale,
1791
1711
  sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
1792
1712
  sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
1793
1713
  [=](sycl::nd_item<3> item_ct1) {
1794
- scale_f32(x, dst, scale, k, item_ct1);
1714
+ scale_f32(x, dst, scale, bias, k, item_ct1);
1795
1715
  });
1796
1716
  }
1797
1717
 
@@ -2066,21 +1986,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
2066
1986
  const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
2067
1987
  ? (const sycl::half *)src1->data + src1_padded_row_size
2068
1988
  : src1_as_f16.get();
2069
- ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
2070
1989
 
2071
1990
  #if GGML_SYCL_DNNL
2072
1991
  if (!g_ggml_sycl_disable_dnn) {
2073
- DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
2074
- DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2075
- dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
2076
- scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2077
- " : converting dst to fp32");
2078
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2079
- to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
1992
+ DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr,
1993
+ DnnlGemmWrapper::to_dt<sycl::half>(), src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
1994
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2080
1995
  }
2081
1996
  else
2082
1997
  #endif
2083
1998
  {
1999
+ ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
2000
+
2084
2001
  const sycl::half alpha_f16 = 1.0f;
2085
2002
  const sycl::half beta_f16 = 0.0f;
2086
2003
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
@@ -2119,8 +2036,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
2119
2036
 
2120
2037
  #if GGML_SYCL_DNNL
2121
2038
  if (!g_ggml_sycl_disable_dnn) {
2122
- DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
2123
- DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
2039
+ DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
2040
+ DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2124
2041
  dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2125
2042
  }
2126
2043
  else
@@ -2268,9 +2185,11 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds
2268
2185
  float * dst_dd = static_cast<float *>(dst->data);
2269
2186
 
2270
2187
  float scale;
2271
- memcpy(&scale, dst->op_params, sizeof(float));
2188
+ float bias;
2189
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
2190
+ memcpy(&bias, (float *) dst->op_params + 1, sizeof(float));
2272
2191
 
2273
- scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream);
2192
+ scale_f32_sycl(src0_dd, dst_dd, scale, bias, ggml_nelements(dst->src[0]), main_stream);
2274
2193
  /*
2275
2194
  DPCT1010:87: SYCL uses exceptions to report errors and does not use the
2276
2195
  error codes. The call was replaced with 0. You need to rewrite this code.
@@ -2319,10 +2238,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
2319
2238
  peer_access_enabled = enable_peer_access;
2320
2239
  }
2321
2240
 
2241
+ template <template <int> typename quantize_f>
2322
2242
  static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
2323
2243
  const ggml_tensor *src1, ggml_tensor *dst,
2324
- ggml_sycl_op_mul_mat_t op,
2325
- const bool convert_src1_to_q8_1) try {
2244
+ ggml_sycl_op_mul_mat_t op) try {
2326
2245
 
2327
2246
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
2328
2247
 
@@ -2417,6 +2336,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2417
2336
  }
2418
2337
  }
2419
2338
 
2339
+ constexpr bool quantize_enabled = !std::is_same_v<quantize_f<QK8_1 / WARP_SIZE>,
2340
+ no_quantize_q8_1<QK8_1 / WARP_SIZE>>;
2420
2341
  for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
2421
2342
  if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
2422
2343
  continue;
@@ -2442,19 +2363,19 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2442
2363
  dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
2443
2364
  }
2444
2365
 
2445
- if (convert_src1_to_q8_1) {
2366
+ if constexpr(quantize_enabled) {
2446
2367
  dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
2447
2368
 
2448
2369
  if (src1_on_device && src1_is_contiguous) {
2449
2370
  scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2450
2371
  /*num_src=*/2, " : converting src1 to Q8_1");
2451
- quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2452
- /*
2453
- DPCT1010:90: SYCL uses exceptions to report errors and does not
2454
- use the error codes. The call was replaced with 0. You need to
2455
- rewrite this code.
2456
- */
2457
- SYCL_CHECK(0);
2372
+ try {
2373
+ quantize_row_q8_1_sycl<quantize_f>(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2374
+ } catch (sycl::exception const &exc) {
2375
+ std::cerr << "Quantize_row_q8_1_sycl error" << exc.what() << "Exception caught at file:" << __FILE__
2376
+ << ", line:" << __LINE__ << std::endl;
2377
+ std::exit(1);
2378
+ }
2458
2379
  }
2459
2380
  }
2460
2381
 
@@ -2470,11 +2391,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2470
2391
  // here an event is recorded that signals that the main device has finished calculating the input data
2471
2392
  if (split && used_devices > 1) {
2472
2393
  ggml_sycl_set_device(ctx.device);
2473
- /*
2474
- DPCT1024:91: The original code returned the error code that was further
2475
- consumed by the program logic. This original code was replaced with 0.
2476
- You may need to rewrite the program logic consuming the error code.
2477
- */
2478
2394
  SYCL_CHECK(CHECK_TRY_ERROR(
2479
2395
  *src0_extra->events[ctx.device][0] =
2480
2396
  ctx.stream()->ext_oneapi_submit_barrier()));
@@ -2498,11 +2414,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2498
2414
 
2499
2415
  // wait for main GPU data if necessary
2500
2416
  if (split && (i != ctx.device || is != 0)) {
2501
- /*
2502
- DPCT1009:163: SYCL uses exceptions to report errors and does not
2503
- use the error codes. The original code was commented out and a
2504
- warning string was inserted. You need to rewrite this code.
2505
- */
2506
2417
  SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
2507
2418
  {*src0_extra->events[ctx.device][0]})));
2508
2419
  }
@@ -2528,39 +2439,42 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2528
2439
  // copy src0, src1 to device if necessary
2529
2440
  if (src1_is_contiguous) {
2530
2441
  if (i != ctx.device) {
2531
- if (convert_src1_to_q8_1) {
2442
+ if constexpr (quantize_enabled) {
2532
2443
  char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
2533
- SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
2534
- src1_ddq_i, src1_ddq_i_source,
2535
- src1_ncols * src1_padded_col_size * q8_1_ts /
2536
- q8_1_bs).wait()));
2444
+ SYCL_CHECK(
2445
+ CHECK_TRY_ERROR(stream
2446
+ ->memcpy(src1_ddq_i, src1_ddq_i_source,
2447
+ src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs)
2448
+ .wait()));
2537
2449
  } else {
2538
-
2539
2450
  float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
2540
- src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
2451
+ src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
2541
2452
 
2542
- SYCL_CHECK(CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream,
2543
- src1_ddf_i, src1_ddf_i_source,
2544
- src1_ncols * ne10 * sizeof(float))));
2453
+ SYCL_CHECK(
2454
+ CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
2455
+ src1_ncols * ne10 * sizeof(float))));
2545
2456
  }
2546
2457
  }
2547
- } else if (src1_on_device && !src1_is_contiguous) {
2548
- SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
2549
- src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
2550
2458
  } else {
2551
- GGML_ABORT("fatal error");
2552
- }
2459
+ if (src1_on_device) {
2460
+ SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, src1_col_0,
2461
+ src1_col_0 + src1_ncols, stream));
2462
+ } else {
2463
+ GGML_ABORT("src1 is non-contiguous and not on device");
2464
+ }
2553
2465
 
2554
- if (convert_src1_to_q8_1 && !src1_is_contiguous) {
2555
- scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2556
- /*num_src=*/2, " : converting src1 to Q8_1");
2557
- quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
2558
- /*
2559
- DPCT1010:92: SYCL uses exceptions to report errors and does
2560
- not use the error codes. The call was replaced with 0. You
2561
- need to rewrite this code.
2562
- */
2563
- SYCL_CHECK(0);
2466
+ if constexpr (quantize_enabled) {
2467
+ scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2468
+ /*num_src=*/2, " : converting src1 to Q8_1");
2469
+ try {
2470
+ quantize_row_q8_1_sycl<quantize_q8_1>(src1_ddf_i, src1_ddq_i, ne10, src1_ncols,
2471
+ src1_padded_col_size, stream);
2472
+ } catch (const sycl::exception & exc) {
2473
+ std::cerr << "Quantize_row_q8_1_sycl error" << exc.what()
2474
+ << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
2475
+ std::exit(1);
2476
+ }
2477
+ }
2564
2478
  }
2565
2479
 
2566
2480
  if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
@@ -2572,12 +2486,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2572
2486
  // do the computation
2573
2487
  SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
2574
2488
  dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
2575
- /*
2576
- DPCT1010:93: SYCL uses exceptions to report errors and does not
2577
- use the error codes. The call was replaced with 0. You need to
2578
- rewrite this code.
2579
- */
2580
- SYCL_CHECK(0);
2581
2489
 
2582
2490
  // copy dst to host or other device if necessary
2583
2491
  if (!dst_on_device) {
@@ -2608,12 +2516,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2608
2516
 
2609
2517
  // add event for the main device to wait on until other device is done
2610
2518
  if (split && (i != ctx.device || is != 0)) {
2611
- /*
2612
- DPCT1024:94: The original code returned the error code that
2613
- was further consumed by the program logic. This original
2614
- code was replaced with 0. You may need to rewrite the
2615
- program logic consuming the error code.
2616
- */
2617
2519
  SYCL_CHECK(CHECK_TRY_ERROR(
2618
2520
  *src0_extra->events[i][is] =
2619
2521
  stream->ext_oneapi_submit_barrier()));
@@ -2712,6 +2614,8 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
2712
2614
  GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
2713
2615
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
2714
2616
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
2617
+ GGML_ASSERT(src1->ne[1] == 1);
2618
+ GGML_ASSERT(src1->ne[3] == 1);
2715
2619
 
2716
2620
  const int64_t ne00 = src0->ne[0];
2717
2621
  const int64_t ne01 = src0->ne[1];
@@ -2721,6 +2625,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
2721
2625
  const int64_t nb02 = src0->nb[2];
2722
2626
 
2723
2627
  const int64_t ne12 = src1->ne[2];
2628
+ const int64_t nb11 = src1->nb[1];
2724
2629
 
2725
2630
  SYCL_CHECK(ggml_sycl_set_device(ctx.device));
2726
2631
  queue_ptr main_stream = ctx.stream();
@@ -2731,8 +2636,9 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
2731
2636
 
2732
2637
  const int64_t row_stride_x = nb01 / sizeof(sycl::half);
2733
2638
  const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
2639
+ const int64_t channel_stride_y = nb11 / sizeof(float);
2734
2640
 
2735
- ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
2641
+ ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream);
2736
2642
  }
2737
2643
  catch (sycl::exception const &exc) {
2738
2644
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2786,8 +2692,11 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2786
2692
  float * dst_ddf = static_cast<float *>(dst->data);
2787
2693
 
2788
2694
  const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
2695
+ const size_t type_size_src0 = ggml_type_size(src0->type);
2789
2696
  const size_t type_size_src1 = ggml_type_size(src1->type);
2790
- GGML_ASSERT(nb10 == type_size_src1);
2697
+
2698
+ bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
2699
+ bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
2791
2700
 
2792
2701
  // SRC1 strides
2793
2702
  int64_t s11 = nb11 / type_size_src1;
@@ -2799,16 +2708,47 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2799
2708
  if (src1->type != GGML_TYPE_F16) {
2800
2709
  scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
2801
2710
  " : converting src1 to fp16");
2802
- const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
2803
- GGML_ASSERT(to_fp16_nc_sycl != nullptr);
2711
+
2712
+ // iterate tensor dims and find the slowest moving dim and stride
2713
+ int last_dim=0;
2714
+ int last_str=0;
2715
+ size_t largest_str=0;
2716
+ for(int i = 0; i< 4; i++){
2717
+ // last stride is always the largest
2718
+ if(src1->nb[i] == largest_str){
2719
+ if(src1->ne[last_dim] == 1){
2720
+ last_str = i;
2721
+ last_dim = i;
2722
+ }
2723
+ }
2724
+ if(src1->nb[i] > largest_str){
2725
+ largest_str = src1->nb[i];
2726
+ last_str = i;
2727
+ last_dim = i;
2728
+ }
2729
+
2730
+ }
2731
+ #if GGML_SYCL_DNNL
2732
+ // oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
2733
+ const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
2734
+ src1_f16_alloc.alloc(ne_src1);
2735
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
2736
+ GGML_ASSERT(to_fp16_sycl != nullptr);
2737
+ to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue);
2738
+ # else
2804
2739
  const int64_t ne_src1 = ggml_nelements(src1);
2805
2740
  src1_f16_alloc.alloc(ne_src1);
2741
+ const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
2742
+ GGML_ASSERT(to_fp16_nc_sycl != nullptr);
2806
2743
  to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
2744
+ #endif
2807
2745
 
2808
2746
  src1_f16 = src1_f16_alloc.get();
2809
2747
  s11 = ne10;
2810
2748
  s12 = ne11 * s11;
2811
2749
  s13 = ne12 * s12;
2750
+
2751
+ is_src1_cont_2 = true;
2812
2752
  }
2813
2753
 
2814
2754
  ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@@ -2837,48 +2777,115 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2837
2777
 
2838
2778
  #if GGML_SYCL_DNNL
2839
2779
  if (!g_ggml_sycl_disable_dnn) {
2840
- auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
2841
- (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
2842
-
2843
- DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
2844
- src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
2845
- src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
2846
- dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
2847
- };
2848
-
2849
- if (r2 == 1 && r3 == 1) {
2850
- if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
2851
- dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
2852
- }
2853
- else {
2854
- for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
2855
- const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
2856
- const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
2857
- float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
2858
- dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
2780
+ int64_t str_a0 = nb00 / type_size_src0;
2781
+ int64_t str_a1 = nb01 / type_size_src0;
2782
+ int64_t str_a2 = nb02 / type_size_src0;
2783
+
2784
+ int64_t str_b0 = nb10 / type_size_src1;
2785
+ int64_t str_b1 = nb11 / type_size_src1;
2786
+ int64_t str_b2 = nb12 / type_size_src1;
2787
+
2788
+ auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
2789
+ const sycl::half *src1, float *dst,
2790
+ int64_t a0, int64_t a1, int64_t batcha,
2791
+ int64_t /*b0*/, int64_t b1, int64_t batchb,
2792
+ int64_t sa0, int64_t sa1, int64_t sa2,
2793
+ int64_t sb0, int64_t sb1, int64_t sb2,
2794
+ int64_t sd2) {
2795
+ bool supported_broadcast = batchb == batcha ? true
2796
+ : batchb == 1 || batcha == 1 ? true
2797
+ : false;
2798
+ if (supported_broadcast) {
2799
+ DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0,
2800
+ DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2, src1,
2801
+ DnnlGemmWrapper::to_dt<sycl::half>(), sb0, sb1, sb2, dst,
2802
+ DnnlGemmWrapper::to_dt<float>(), queue, batcha, batchb);
2803
+ } else {
2804
+ // iterate over batches from smaller set of matrices (matrix 0)
2805
+ int64_t batches0 = batcha;
2806
+ int64_t batches1 = batchb;
2807
+
2808
+ if (batches0 > batches1) {
2809
+ int64_t num_mul_mats = batches1;
2810
+ int64_t sub_batch = batches0 / num_mul_mats;
2811
+ // src0 is batched and bigger, shift and multiply with src1
2812
+ for (int64_t i0 = 0; i0 < num_mul_mats; i0++) {
2813
+ const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch);
2814
+ const sycl::half *src1_shifted = src1 + (sb2 * i0);
2815
+ float *dst_shifted = dst + (sd2 * i0 * sub_batch);
2816
+ DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
2817
+ DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
2818
+ src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
2819
+ sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
2820
+ queue, sub_batch, 1);
2821
+ }
2822
+ } else {
2823
+ int64_t num_mul_mats = batches0;
2824
+ int64_t sub_batch = batches1 / num_mul_mats;
2825
+ // src1 is batched and bigger, shift and multiply with src0
2826
+ for (int64_t i1 = 0; i1 < num_mul_mats; i1++) {
2827
+ const sycl::half *src0_shifted = src0 + (sa2 * i1);
2828
+ const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch);
2829
+ float *dst_shifted = dst + (sd2 * i1 * sub_batch);
2830
+ DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
2831
+ DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
2832
+ src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
2833
+ sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
2834
+ queue, 1, sub_batch);
2835
+ }
2836
+ }
2859
2837
  }
2860
- }
2861
- } else {
2862
- // iterate over batches from smaller set of matrices (matrix 0)
2863
- for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
2864
- for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
2865
- const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
2866
- const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
2867
- float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
2868
- dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
2838
+ };
2839
+
2840
+ const bool cont_batches_dim2_a = nb02 * ne02 == nb03;
2841
+ const bool cont_batches_dim2_b = nb12 * ne12 == nb13;
2842
+ const bool cont_batches_dim3_a = ne02 == 1 && nb02 * ne01 == nb03;
2843
+ const bool cont_batches_dim3_b = ne12 == 1 && nb12 * ne11 == nb13;
2844
+ if (cont_batches_dim2_a && cont_batches_dim2_b) {
2845
+ // A batch is considered contiguous if the dimension 2 is not strided
2846
+ int64_t batches0 = ne02 * ne03;
2847
+ int64_t batches1 = ne12 * ne13;
2848
+ launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
2849
+ ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
2850
+ str_b2, nb2 / sizeof(float));
2851
+ } else if (cont_batches_dim3_a && cont_batches_dim3_b) {
2852
+ // This case is similar to the one above with the difference that only the batch in dimension 3 is used and the dimension 2 is of size 1.
2853
+ int64_t batches0 = ne02 * ne03;
2854
+ int64_t batches1 = ne12 * ne13;
2855
+ int64_t str_a3 = nb03 / type_size_src0;
2856
+ int64_t str_b3 = nb13 / type_size_src1;
2857
+ launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
2858
+ ne10, ne11, batches1, str_a0, str_a1, str_a3, str_b0, str_b1,
2859
+ str_b3, nb2 / sizeof(float));
2860
+ } else {
2861
+ for (int64_t b_a = 0; b_a < ne03; b_a++) {
2862
+ const sycl::half *src0_f16_shifted
2863
+ = src0_f16 + (nb03 * b_a / type_size_src0);
2864
+ const sycl::half *src1_f16_shifted
2865
+ = src1_f16 + (nb13 * b_a / type_size_src1);
2866
+ float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float));
2867
+ int64_t batches0 = ne02;
2868
+ int64_t batches1 = ne12;
2869
+ launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted,
2870
+ ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1,
2871
+ str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float));
2869
2872
  }
2870
2873
  }
2871
- }
2874
+
2872
2875
  }
2873
2876
  else
2874
2877
  #endif
2875
2878
  {
2876
- if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
2879
+ if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
2880
+ // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
2881
+ const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
2882
+ const int64_t smb = ne12 == 1 ? s13 : s12;
2883
+
2877
2884
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
2878
2885
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
2879
2886
  oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
2880
- src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
2881
- src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
2887
+ src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
2888
+ src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
2882
2889
  mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
2883
2890
  } else {
2884
2891
  const int ne23 = ne12 * ne13;
@@ -2928,6 +2935,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
2928
2935
  case GGML_TYPE_Q4_0:
2929
2936
  return true;
2930
2937
  case GGML_TYPE_Q4_K:
2938
+ case GGML_TYPE_Q6_K:
2931
2939
  return !g_ggml_sycl_prioritize_dmmv;
2932
2940
  default:
2933
2941
  return false;
@@ -2947,6 +2955,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
2947
2955
  switch (type) {
2948
2956
  case GGML_TYPE_Q4_0:
2949
2957
  case GGML_TYPE_Q4_K:
2958
+ case GGML_TYPE_Q6_K:
2950
2959
  return true;
2951
2960
  default:
2952
2961
  return false;
@@ -3031,6 +3040,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
3031
3040
  sycl::free(tmp_buf, *stream);
3032
3041
  }
3033
3042
 
3043
+ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3044
+ GGML_ASSERT(size % sizeof(block_q6_K) == 0);
3045
+ GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
3046
+
3047
+ const int nblocks = size / sizeof(block_q6_K);
3048
+
3049
+ auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
3050
+ SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
3051
+
3052
+ auto * ql_ptr = data_device;
3053
+ auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
3054
+ auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
3055
+ sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
3056
+
3057
+ stream
3058
+ ->parallel_for(nblocks,
3059
+ [=](auto i) {
3060
+ const block_q6_K * x = (const block_q6_K *) tmp_buf;
3061
+ const int ib = i;
3062
+
3063
+ const uint8_t * ql = x[ib].ql;
3064
+ const uint8_t * qh = x[ib].qh;
3065
+ uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
3066
+ uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
3067
+ uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
3068
+
3069
+ for (int j = 0; j < QK_K / 2; ++j) {
3070
+ base_ql_ptr[j] = ql[j];
3071
+ }
3072
+ for (int j = 0; j < QK_K / 4; ++j) {
3073
+ base_qh_ptr[j] = qh[j];
3074
+ }
3075
+
3076
+ for (int j = 0; j < QK_K / 16; ++j) {
3077
+ base_scales_ptr[j] = x[ib].scales[j];
3078
+ }
3079
+
3080
+ dm_ptr[ib] = x[ib].d;
3081
+ })
3082
+ .wait_and_throw();
3083
+
3084
+ sycl::free(tmp_buf, *stream);
3085
+ }
3086
+
3034
3087
  static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3035
3088
  uint8_t * data_device = (uint8_t *) src0->data;
3036
3089
  size_t ncols = src0->ne[0];
@@ -3044,6 +3097,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3044
3097
  case GGML_TYPE_Q4_K:
3045
3098
  reorder_qw_q4_k(data_device, size, 0, stream);
3046
3099
  break;
3100
+ case GGML_TYPE_Q6_K:
3101
+ reorder_qw_q6_k(data_device, size, 0, stream);
3102
+ break;
3047
3103
  default:
3048
3104
  GGML_ABORT("reorder_qw() called with unsupported type");
3049
3105
  break;
@@ -3159,26 +3215,27 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
3159
3215
  // The kernel from the if path is faster for that specific case, but does not support all mul mats.
3160
3216
  ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
3161
3217
  }
3162
- } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
3218
+ } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1 && src1->ne[3] == 1) {
3163
3219
  // KQV single-batch
3164
3220
  ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
3165
- } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
3221
+ } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {
3166
3222
  // KQ + KQV multi-batch
3167
3223
  ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
3168
3224
  } else if (use_dequantize_mul_mat_vec) {
3169
- constexpr bool convert_src1_to_q8_1 = false;
3170
3225
  opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
3171
- ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
3226
+ ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec);
3172
3227
  } else if (use_mul_mat_vec_q) {
3173
- constexpr bool convert_src1_to_q8_1 = true;
3174
3228
  opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
3175
- ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
3229
+ ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
3230
+ if (extra && extra->optimized_feature.reorder) {
3231
+ ggml_sycl_op_mul_mat<quantize_and_reorder_q8_1_soa>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
3232
+ } else {
3233
+ ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
3234
+ }
3176
3235
  } else if (use_mul_mat_q) {
3177
- constexpr bool convert_src1_to_q8_1 = true;
3178
- ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
3236
+ ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q);
3179
3237
  } else {
3180
- constexpr bool convert_src1_to_q8_1 = false;
3181
- ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
3238
+ ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl);
3182
3239
  }
3183
3240
  }
3184
3241
 
@@ -3345,8 +3402,11 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3345
3402
  SYCL_CHECK(CHECK_TRY_ERROR(
3346
3403
  stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
3347
3404
 
3405
+ const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
3406
+ assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
3407
+
3348
3408
  {
3349
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
3409
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
3350
3410
  sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
3351
3411
  stream->submit([&](sycl::handler &cgh) {
3352
3412
  sycl::local_accessor<int, 0> src1_row_acc(cgh);
@@ -3391,7 +3451,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3391
3451
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
3392
3452
 
3393
3453
  {
3394
- sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
3454
+ sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
3395
3455
  sycl::range<3> grid_dims(1, 1, num_src1_rows);
3396
3456
  stream->submit([&](sycl::handler &cgh) {
3397
3457
  const char *__restrict dst_contiguous_get =
@@ -3504,6 +3564,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3504
3564
  case GGML_OP_GET_ROWS:
3505
3565
  ggml_sycl_get_rows(ctx, dst);
3506
3566
  break;
3567
+ case GGML_OP_SET_ROWS:
3568
+ ggml_sycl_op_set_rows(ctx, dst);
3569
+ break;
3507
3570
  case GGML_OP_DUP:
3508
3571
  ggml_sycl_dup(ctx, dst);
3509
3572
  break;
@@ -3514,6 +3577,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3514
3577
  case GGML_OP_SUB:
3515
3578
  ggml_sycl_sub(ctx, dst);
3516
3579
  break;
3580
+ case GGML_OP_COUNT_EQUAL:
3581
+ ggml_sycl_count_equal(ctx, dst);
3582
+ break;
3517
3583
  case GGML_OP_ACC:
3518
3584
  ggml_sycl_acc(ctx, dst);
3519
3585
  break;
@@ -3543,6 +3609,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3543
3609
  case GGML_UNARY_OP_GELU_QUICK:
3544
3610
  ggml_sycl_gelu_quick(ctx, dst);
3545
3611
  break;
3612
+ case GGML_UNARY_OP_GELU_ERF:
3613
+ ggml_sycl_gelu_erf(ctx, dst);
3614
+ break;
3546
3615
  case GGML_UNARY_OP_TANH:
3547
3616
  ggml_sycl_tanh(ctx, dst);
3548
3617
  break;
@@ -3574,6 +3643,27 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3574
3643
  return false;
3575
3644
  }
3576
3645
  break;
3646
+ case GGML_OP_GLU:
3647
+ switch (ggml_get_glu_op(dst)) {
3648
+ case GGML_GLU_OP_REGLU:
3649
+ ggml_sycl_reglu(ctx, dst);
3650
+ break;
3651
+ case GGML_GLU_OP_GEGLU:
3652
+ ggml_sycl_geglu(ctx, dst);
3653
+ break;
3654
+ case GGML_GLU_OP_SWIGLU:
3655
+ ggml_sycl_swiglu(ctx, dst);
3656
+ break;
3657
+ case GGML_GLU_OP_GEGLU_ERF:
3658
+ ggml_sycl_geglu_erf(ctx, dst);
3659
+ break;
3660
+ case GGML_GLU_OP_GEGLU_QUICK:
3661
+ ggml_sycl_geglu_quick(ctx, dst);
3662
+ break;
3663
+ default:
3664
+ return false;
3665
+ }
3666
+ break;
3577
3667
  case GGML_OP_NORM:
3578
3668
  ggml_sycl_norm(ctx, dst);
3579
3669
  break;
@@ -3752,7 +3842,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
3752
3842
  const void *data, size_t offset,
3753
3843
  size_t size) try {
3754
3844
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3755
- debug_print_tensor(": tensor=", tensor);
3845
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
3756
3846
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
3757
3847
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3758
3848
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3773,7 +3863,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
3773
3863
  void *data, size_t offset,
3774
3864
  size_t size) try {
3775
3865
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3776
- debug_print_tensor(": tensor=", tensor);
3866
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
3777
3867
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
3778
3868
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3779
3869
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3796,8 +3886,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
3796
3886
  bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
3797
3887
  ggml_backend_buffer_is_sycl(src->buffer);
3798
3888
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3799
- debug_print_tensor(": dst=", dst);
3800
- debug_print_tensor(" src=", src);
3889
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
3890
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
3801
3891
  GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
3802
3892
  if (is_cpy_supported) {
3803
3893
  /*
@@ -3983,6 +4073,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
3983
4073
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
3984
4074
  /* .event_record = */ ggml_backend_sycl_event_record,
3985
4075
  /* .event_wait = */ ggml_backend_sycl_event_wait,
4076
+ /* .graph_optimize = */ NULL,
3986
4077
  };
3987
4078
 
3988
4079
  static ggml_guid_t ggml_backend_sycl_guid() {
@@ -4096,6 +4187,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4096
4187
  case GGML_UNARY_OP_HARDSIGMOID:
4097
4188
  case GGML_UNARY_OP_HARDSWISH:
4098
4189
  case GGML_UNARY_OP_GELU_QUICK:
4190
+ case GGML_UNARY_OP_GELU_ERF:
4099
4191
  case GGML_UNARY_OP_TANH:
4100
4192
  case GGML_UNARY_OP_EXP:
4101
4193
  case GGML_UNARY_OP_SGN:
@@ -4109,18 +4201,24 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4109
4201
  default:
4110
4202
  return false;
4111
4203
  }
4204
+ case GGML_OP_GLU:
4205
+ switch (ggml_get_glu_op(op)) {
4206
+ case GGML_GLU_OP_REGLU:
4207
+ case GGML_GLU_OP_GEGLU:
4208
+ case GGML_GLU_OP_SWIGLU:
4209
+ case GGML_GLU_OP_GEGLU_ERF:
4210
+ case GGML_GLU_OP_GEGLU_QUICK:
4211
+ return ggml_is_contiguous_1(op->src[0]);
4212
+ default:
4213
+ return false;
4214
+ }
4215
+ break;
4112
4216
  case GGML_OP_MUL_MAT:
4113
4217
  case GGML_OP_MUL_MAT_ID:
4114
4218
  {
4115
- struct ggml_tensor * a;
4116
- struct ggml_tensor * b;
4117
- if (op->op == GGML_OP_MUL_MAT) {
4118
- a = op->src[0];
4119
- b = op->src[1];
4120
- } else {
4121
- a = op->src[2];
4122
- b = op->src[1];
4123
- }
4219
+ struct ggml_tensor * a = op->src[0];
4220
+ struct ggml_tensor * b = op->src[1];
4221
+
4124
4222
  if (a->ne[3] != b->ne[3]) {
4125
4223
  return false;
4126
4224
  }
@@ -4135,7 +4233,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4135
4233
  }
4136
4234
  }
4137
4235
  ggml_type src0_type = op->src[0]->type;
4138
- if (src0_type == GGML_TYPE_BF16) {
4236
+ if (src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_MXFP4) {
4237
+ // TODO: support MXFP4
4238
+ // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
4239
+ return false;
4240
+ }
4241
+ // TODO: The configuration below needs more work to be supported with oneDNN
4242
+ if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
4243
+ return false;
4244
+ }
4245
+ // TODO: This specific configuration can fail with oneDNN and needs more debugging
4246
+ if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
4247
+ a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
4139
4248
  return false;
4140
4249
  }
4141
4250
  return true;
@@ -4157,10 +4266,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4157
4266
  return false;
4158
4267
  }
4159
4268
  }
4269
+ case GGML_OP_SET_ROWS:
4270
+ {
4271
+ return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
4272
+ op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
4273
+ op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
4274
+ (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32));
4275
+ }
4276
+ break;
4160
4277
  case GGML_OP_CPY:
4161
4278
  {
4162
4279
  ggml_type src0_type = op->src[0]->type;
4163
4280
  ggml_type src1_type = op->src[1]->type;
4281
+ if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
4282
+ return true;
4283
+ }
4164
4284
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
4165
4285
  return true;
4166
4286
  }
@@ -4206,6 +4326,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4206
4326
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
4207
4327
  return true;
4208
4328
  }
4329
+ if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
4330
+ return true;
4331
+ }
4332
+ if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
4333
+ return true;
4334
+ }
4335
+ if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
4336
+ return true;
4337
+ }
4338
+ if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
4339
+ return true;
4340
+ }
4341
+ if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
4342
+ return true;
4343
+ }
4209
4344
  return false;
4210
4345
  }
4211
4346
  case GGML_OP_CONCAT:
@@ -4224,6 +4359,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4224
4359
  case GGML_OP_ADD:
4225
4360
  case GGML_OP_ADD1:
4226
4361
  case GGML_OP_SUB:
4362
+ case GGML_OP_COUNT_EQUAL:
4227
4363
  case GGML_OP_MUL:
4228
4364
  case GGML_OP_DIV:
4229
4365
  case GGML_OP_REPEAT:
@@ -4240,37 +4376,44 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4240
4376
  return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
4241
4377
  #endif
4242
4378
  case GGML_OP_NORM:
4243
- case GGML_OP_RMS_NORM:
4244
4379
  return true;
4245
4380
  case GGML_OP_L2_NORM:
4246
4381
  case GGML_OP_GROUP_NORM:
4247
4382
  return ggml_is_contiguous(op->src[0]);
4383
+ case GGML_OP_RMS_NORM:
4384
+ return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
4248
4385
  case GGML_OP_SCALE:
4249
4386
  return true;
4250
4387
  case GGML_OP_CONT:
4251
4388
  return op->src[0]->type != GGML_TYPE_BF16;
4252
- case GGML_OP_DIAG_MASK_INF:
4253
4389
  case GGML_OP_SOFT_MAX:
4254
- return true;
4255
- case GGML_OP_ROPE:
4256
- {
4257
- const int mode = ((const int32_t *) op->op_params)[2];
4258
- // mode is not used as a bitmask in practice, the various rope type modes are independent implementations
4259
- if (mode == GGML_ROPE_TYPE_MROPE) {
4260
- return false;
4261
- }
4262
- return true;
4390
+ // TODO: support batching
4391
+ if (op->src[0]->ne[3] != 1) {
4392
+ return false;
4393
+ }
4394
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
4395
+ if (op->src[2]) {
4396
+ return false;
4263
4397
  }
4398
+ // TODO: support broadcast
4399
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14435
4400
+ return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
4401
+ case GGML_OP_DIAG_MASK_INF:
4402
+ case GGML_OP_ROPE:
4264
4403
  case GGML_OP_IM2COL:
4265
4404
  return true;
4266
4405
  case GGML_OP_UPSCALE:
4267
4406
  return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
4268
- case GGML_OP_POOL_2D:
4269
4407
  case GGML_OP_SUM:
4270
4408
  case GGML_OP_SUM_ROWS:
4271
4409
  case GGML_OP_ARGSORT:
4410
+ return ggml_is_contiguous(op->src[0]);
4411
+ case GGML_OP_POOL_2D:
4272
4412
  case GGML_OP_ACC:
4413
+ return true;
4273
4414
  case GGML_OP_PAD:
4415
+ return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
4416
+ (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
4274
4417
  case GGML_OP_LEAKY_RELU:
4275
4418
  case GGML_OP_TIMESTEP_EMBEDDING:
4276
4419
  case GGML_OP_RWKV_WKV6:
@@ -4481,10 +4624,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
4481
4624
  };
4482
4625
 
4483
4626
  ggml_backend_t sycl_backend = new ggml_backend {
4484
- /* .guid = */ ggml_backend_sycl_guid(),
4485
- /* .interface = */ ggml_backend_sycl_interface,
4486
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
4487
- /* .context = */ ctx
4627
+ /* .guid = */ ggml_backend_sycl_guid(),
4628
+ /* .iface = */ ggml_backend_sycl_interface,
4629
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
4630
+ /* .context = */ ctx
4488
4631
  };
4489
4632
 
4490
4633
  return sycl_backend;