whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1818
1818
  dpct::has_capability_or_fail(stream->get_device(),
1819
1819
  {sycl::aspect::fp16});
1820
1820
 
1821
- stream->submit([&](sycl::handler &cgh) {
1821
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1822
1822
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1823
1823
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1824
1824
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1829
1829
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1830
1830
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1831
1831
 
1832
- cgh.parallel_for(
1833
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1834
- [=](sycl::nd_item<3> item_ct1) {
1832
+ sycl_parallel_for(
1833
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
1835
1834
  mul_mat_q4_0<need_check>(
1836
1835
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1837
1836
  nrows_dst, item_ct1,
@@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1853
1852
  dpct::has_capability_or_fail(stream->get_device(),
1854
1853
  {sycl::aspect::fp16});
1855
1854
 
1856
- stream->submit([&](sycl::handler &cgh) {
1855
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1857
1856
  sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
1858
1857
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
1859
1858
  sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1864
1863
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1865
1864
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1866
1865
 
1867
- cgh.parallel_for(
1868
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1869
- [=](sycl::nd_item<3> item_ct1) {
1866
+ sycl_parallel_for(
1867
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
1870
1868
  mul_mat_q4_0<need_check>(
1871
1869
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1872
1870
  nrows_dst, item_ct1,
@@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1933
1931
  dpct::has_capability_or_fail(stream->get_device(),
1934
1932
  {sycl::aspect::fp16});
1935
1933
 
1936
- stream->submit([&](sycl::handler &cgh) {
1934
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1937
1935
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1938
1936
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1939
1937
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1944
1942
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1945
1943
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1946
1944
 
1947
- cgh.parallel_for(
1948
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1949
- [=](sycl::nd_item<3> item_ct1) {
1945
+ sycl_parallel_for(
1946
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
1950
1947
  mul_mat_q4_1<need_check>(
1951
1948
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1952
1949
  nrows_dst, item_ct1,
@@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1968
1965
  dpct::has_capability_or_fail(stream->get_device(),
1969
1966
  {sycl::aspect::fp16});
1970
1967
 
1971
- stream->submit([&](sycl::handler &cgh) {
1968
+ sycl_launch(stream, [&](sycl::handler & cgh) {
1972
1969
  sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
1973
1970
  sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
1974
1971
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1979
1976
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
1980
1977
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
1981
1978
 
1982
- cgh.parallel_for(
1983
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1984
- [=](sycl::nd_item<3> item_ct1) {
1979
+ sycl_parallel_for(
1980
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
1985
1981
  mul_mat_q4_1<need_check>(
1986
1982
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
1987
1983
  nrows_dst, item_ct1,
@@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2048
2044
  dpct::has_capability_or_fail(stream->get_device(),
2049
2045
  {sycl::aspect::fp16});
2050
2046
 
2051
- stream->submit([&](sycl::handler &cgh) {
2047
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2052
2048
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2053
2049
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2054
2050
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2059
2055
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2060
2056
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2061
2057
 
2062
- cgh.parallel_for(
2063
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2064
- [=](sycl::nd_item<3> item_ct1) {
2058
+ sycl_parallel_for(
2059
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2065
2060
  mul_mat_q5_0<need_check>(
2066
2061
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2067
2062
  nrows_dst, item_ct1,
@@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2083
2078
  dpct::has_capability_or_fail(stream->get_device(),
2084
2079
  {sycl::aspect::fp16});
2085
2080
 
2086
- stream->submit([&](sycl::handler &cgh) {
2081
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2087
2082
  sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
2088
2083
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2089
2084
  sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2094
2089
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2095
2090
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2096
2091
 
2097
- cgh.parallel_for(
2098
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2099
- [=](sycl::nd_item<3> item_ct1) {
2092
+ sycl_parallel_for(
2093
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2100
2094
  mul_mat_q5_0<need_check>(
2101
2095
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2102
2096
  nrows_dst, item_ct1,
@@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2163
2157
  dpct::has_capability_or_fail(stream->get_device(),
2164
2158
  {sycl::aspect::fp16});
2165
2159
 
2166
- stream->submit([&](sycl::handler &cgh) {
2160
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2167
2161
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2168
2162
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2169
2163
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2174
2168
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2175
2169
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2176
2170
 
2177
- cgh.parallel_for(
2178
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2179
- [=](sycl::nd_item<3> item_ct1) {
2171
+ sycl_parallel_for(
2172
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2180
2173
  mul_mat_q5_1<need_check>(
2181
2174
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2182
2175
  nrows_dst, item_ct1,
@@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2198
2191
  dpct::has_capability_or_fail(stream->get_device(),
2199
2192
  {sycl::aspect::fp16});
2200
2193
 
2201
- stream->submit([&](sycl::handler &cgh) {
2194
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2202
2195
  sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
2203
2196
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2204
2197
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2209
2202
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2210
2203
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2211
2204
 
2212
- cgh.parallel_for(
2213
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2214
- [=](sycl::nd_item<3> item_ct1) {
2205
+ sycl_parallel_for(
2206
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2215
2207
  mul_mat_q5_1<need_check>(
2216
2208
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2217
2209
  nrows_dst, item_ct1,
@@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2278
2270
  dpct::has_capability_or_fail(stream->get_device(),
2279
2271
  {sycl::aspect::fp16});
2280
2272
 
2281
- stream->submit([&](sycl::handler &cgh) {
2273
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2282
2274
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2283
2275
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2284
2276
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2289
2281
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2290
2282
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2291
2283
 
2292
- cgh.parallel_for(
2293
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2294
- [=](sycl::nd_item<3> item_ct1) {
2284
+ sycl_parallel_for(
2285
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2295
2286
  mul_mat_q8_0<need_check>(
2296
2287
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2297
2288
  nrows_dst, item_ct1,
@@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2313
2304
  dpct::has_capability_or_fail(stream->get_device(),
2314
2305
  {sycl::aspect::fp16});
2315
2306
 
2316
- stream->submit([&](sycl::handler &cgh) {
2307
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2317
2308
  sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
2318
2309
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2319
2310
  sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2324
2315
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2325
2316
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2326
2317
 
2327
- cgh.parallel_for(
2328
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2329
- [=](sycl::nd_item<3> item_ct1) {
2318
+ sycl_parallel_for(
2319
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2330
2320
  mul_mat_q8_0<need_check>(
2331
2321
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2332
2322
  nrows_dst, item_ct1,
@@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2393
2383
  dpct::has_capability_or_fail(stream->get_device(),
2394
2384
  {sycl::aspect::fp16});
2395
2385
 
2396
- stream->submit([&](sycl::handler &cgh) {
2386
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2397
2387
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2398
2388
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2399
2389
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2406
2396
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2407
2397
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2408
2398
 
2409
- cgh.parallel_for(
2410
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2411
- [=](sycl::nd_item<3> item_ct1) {
2399
+ sycl_parallel_for(
2400
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2412
2401
  mul_mat_q2_K<need_check>(
2413
2402
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2414
2403
  nrows_dst, item_ct1,
@@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2431
2420
  dpct::has_capability_or_fail(stream->get_device(),
2432
2421
  {sycl::aspect::fp16});
2433
2422
 
2434
- stream->submit([&](sycl::handler &cgh) {
2423
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2435
2424
  sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
2436
2425
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2437
2426
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2444
2433
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2445
2434
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2446
2435
 
2447
- cgh.parallel_for(
2448
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2449
- [=](sycl::nd_item<3> item_ct1) {
2436
+ sycl_parallel_for(
2437
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2450
2438
  mul_mat_q2_K<need_check>(
2451
2439
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2452
2440
  nrows_dst, item_ct1,
@@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2516
2504
  dpct::has_capability_or_fail(stream->get_device(),
2517
2505
  {sycl::aspect::fp16});
2518
2506
 
2519
- stream->submit([&](sycl::handler &cgh) {
2507
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2520
2508
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2521
2509
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2522
2510
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2531
2519
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2532
2520
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2533
2521
 
2534
- cgh.parallel_for(
2535
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2536
- [=](sycl::nd_item<3> item_ct1) {
2522
+ sycl_parallel_for(
2523
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2537
2524
  mul_mat_q3_K<need_check>(
2538
2525
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2539
2526
  nrows_dst, item_ct1,
@@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2557
2544
  dpct::has_capability_or_fail(stream->get_device(),
2558
2545
  {sycl::aspect::fp16});
2559
2546
 
2560
- stream->submit([&](sycl::handler &cgh) {
2547
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2561
2548
  sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
2562
2549
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2563
2550
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2572
2559
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2573
2560
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2574
2561
 
2575
- cgh.parallel_for(
2576
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2577
- [=](sycl::nd_item<3> item_ct1) {
2562
+ sycl_parallel_for(
2563
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2578
2564
  mul_mat_q3_K<need_check>(
2579
2565
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2580
2566
  nrows_dst, item_ct1,
@@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2644
2630
  dpct::has_capability_or_fail(stream->get_device(),
2645
2631
  {sycl::aspect::fp16});
2646
2632
 
2647
- stream->submit([&](sycl::handler &cgh) {
2633
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2648
2634
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2649
2635
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2650
2636
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2657
2643
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2658
2644
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2659
2645
 
2660
- cgh.parallel_for(
2661
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2662
- [=](sycl::nd_item<3> item_ct1) {
2646
+ sycl_parallel_for(
2647
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2663
2648
  mul_mat_q4_K<need_check>(
2664
2649
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2665
2650
  nrows_dst, item_ct1,
@@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2682
2667
  dpct::has_capability_or_fail(stream->get_device(),
2683
2668
  {sycl::aspect::fp16});
2684
2669
 
2685
- stream->submit([&](sycl::handler &cgh) {
2670
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2686
2671
  sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
2687
2672
  sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
2688
2673
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2695
2680
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2696
2681
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2697
2682
 
2698
- cgh.parallel_for(
2699
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2700
- [=](sycl::nd_item<3> item_ct1) {
2683
+ sycl_parallel_for(
2684
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2701
2685
  mul_mat_q4_K<need_check>(
2702
2686
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2703
2687
  nrows_dst, item_ct1,
@@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2765
2749
  dpct::has_capability_or_fail(stream->get_device(),
2766
2750
  {sycl::aspect::fp16});
2767
2751
 
2768
- stream->submit([&](sycl::handler &cgh) {
2752
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2769
2753
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2770
2754
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2771
2755
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2778
2762
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2779
2763
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2780
2764
 
2781
- cgh.parallel_for(
2782
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2783
- [=](sycl::nd_item<3> item_ct1) {
2765
+ sycl_parallel_for(
2766
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2784
2767
  mul_mat_q5_K<need_check>(
2785
2768
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2786
2769
  nrows_dst, item_ct1,
@@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2803
2786
  dpct::has_capability_or_fail(stream->get_device(),
2804
2787
  {sycl::aspect::fp16});
2805
2788
 
2806
- stream->submit([&](sycl::handler &cgh) {
2789
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2807
2790
  sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
2808
2791
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2809
2792
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2816
2799
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2817
2800
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2818
2801
 
2819
- cgh.parallel_for(
2820
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2821
- [=](sycl::nd_item<3> item_ct1) {
2802
+ sycl_parallel_for(
2803
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2822
2804
  mul_mat_q5_K<need_check>(
2823
2805
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2824
2806
  nrows_dst, item_ct1,
@@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2886
2868
  dpct::has_capability_or_fail(stream->get_device(),
2887
2869
  {sycl::aspect::fp16});
2888
2870
 
2889
- stream->submit([&](sycl::handler &cgh) {
2871
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2890
2872
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2891
2873
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2892
2874
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2899
2881
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2900
2882
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2901
2883
 
2902
- cgh.parallel_for(
2903
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2904
- [=](sycl::nd_item<3> item_ct1) {
2884
+ sycl_parallel_for(
2885
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2905
2886
  mul_mat_q6_K<need_check>(
2906
2887
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2907
2888
  nrows_dst, item_ct1,
@@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2924
2905
  dpct::has_capability_or_fail(stream->get_device(),
2925
2906
  {sycl::aspect::fp16});
2926
2907
 
2927
- stream->submit([&](sycl::handler &cgh) {
2908
+ sycl_launch(stream, [&](sycl::handler & cgh) {
2928
2909
  sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
2929
2910
  sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
2930
2911
  sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2937
2918
  sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
2938
2919
  sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
2939
2920
 
2940
- cgh.parallel_for(
2941
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
2942
- [=](sycl::nd_item<3> item_ct1) {
2921
+ sycl_parallel_for(
2922
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
2943
2923
  mul_mat_q6_K<need_check>(
2944
2924
  vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
2945
2925
  nrows_dst, item_ct1,