@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -225,9 +225,9 @@ struct bin_bcast_sycl {
225
225
  dpct::has_capability_or_fail(stream->get_device(),
226
226
  {sycl::aspect::fp16});
227
227
 
228
- stream->parallel_for(
229
- sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
230
- sycl::range<3>(1, 1, block_size),
228
+ sycl_parallel_for(
229
+ stream,
230
+ sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
231
231
  sycl::range<3>(1, 1, block_size)),
232
232
  [=](sycl::nd_item<3> item_ct1) {
233
233
  k_bin_bcast_unravel<bin_op>(
@@ -246,9 +246,8 @@ struct bin_bcast_sycl {
246
246
  dpct::has_capability_or_fail(stream->get_device(),
247
247
  {sycl::aspect::fp16});
248
248
 
249
- stream->parallel_for(
250
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
251
- [=](sycl::nd_item<3> item_ct1) {
249
+ sycl_parallel_for(
250
+ stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
252
251
  k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
253
252
  ne2, ne3, ne10, ne11, ne12, ne13,
254
253
  s1, s2, s3, s01, s02, s03, s11, s12, s13,
@@ -199,7 +199,7 @@ struct sycl_device_info {
199
199
  // size_t smpb; // max. shared memory per block
200
200
  bool vmm; // virtual memory support
201
201
  size_t total_vram;
202
- sycl_hw_info hw_info;
202
+ //sycl_hw_info hw_info; \\ device id and aarch, currently not used
203
203
  optimize_feature opt_feature;
204
204
  };
205
205
 
@@ -286,29 +286,6 @@ struct ggml_tensor_extra_gpu {
286
286
 
287
287
  void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
288
288
 
289
- inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
290
- optimize_feature opt;
291
-
292
- opt.reorder =
293
- (arch == syclex::architecture::intel_gpu_dg1 ||
294
- arch == syclex::architecture::intel_gpu_acm_g10 ||
295
- arch == syclex::architecture::intel_gpu_acm_g11 ||
296
- arch == syclex::architecture::intel_gpu_acm_g12 ||
297
- arch == syclex::architecture::intel_gpu_pvc ||
298
- arch == syclex::architecture::intel_gpu_pvc_vg ||
299
- arch == syclex::architecture::intel_gpu_mtl_u ||
300
- arch == syclex::architecture::intel_gpu_mtl_s ||
301
- arch == syclex::architecture::intel_gpu_mtl_h ||
302
- arch == syclex::architecture::intel_gpu_arl_u ||
303
- arch == syclex::architecture::intel_gpu_arl_s ||
304
- arch == syclex::architecture::intel_gpu_arl_h ||
305
- arch == syclex::architecture::intel_gpu_bmg_g21 ||
306
- arch == syclex::architecture::intel_gpu_lnl_m
307
- );
308
-
309
- return opt;
310
- }
311
-
312
289
  namespace sycl_ex = sycl::ext::oneapi::experimental;
313
290
  struct ggml_backend_sycl_context {
314
291
  int device;
@@ -89,33 +89,24 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
89
89
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
90
90
  switch (dim) {
91
91
  case 0:
92
- stream->parallel_for(
93
- sycl::nd_range<3>(gridDim *
94
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
95
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
96
- [=](sycl::nd_item<3> item_ct1) {
97
- concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
98
- });
99
- break;
92
+ sycl_parallel_for(stream,
93
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
94
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
95
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); });
96
+ break;
100
97
  case 1:
101
- stream->parallel_for(
102
- sycl::nd_range<3>(gridDim *
103
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
104
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
105
- [=](sycl::nd_item<3> item_ct1) {
106
- concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
107
- });
108
- break;
98
+ sycl_parallel_for(stream,
99
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
100
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
101
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); });
102
+ break;
109
103
  // dim >=2 will be dispatched to the default path
110
104
  default:
111
- stream->parallel_for(
112
- sycl::nd_range<3>(gridDim *
113
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
114
- sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
115
- [=](sycl::nd_item<3> item_ct1) {
116
- concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
117
- });
118
- break;
105
+ sycl_parallel_for(stream,
106
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
107
+ sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
108
+ [=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); });
109
+ break;
119
110
  }
120
111
  }
121
112
 
@@ -129,33 +120,29 @@ static void concat_f32_sycl_non_cont(
129
120
  int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
130
121
  uint64_t nb3, int32_t dim) {
131
122
  sycl::range<3> gridDim(ne3, ne2, ne1);
132
- stream->parallel_for(
133
- sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
134
- [=](sycl::nd_item<3> item_ct1) {
135
- int64_t i3 = item_ct1.get_group(0);
136
- int64_t i2 = item_ct1.get_group(1);
137
- int64_t i1 = item_ct1.get_group(2);
123
+ sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
124
+ int64_t i3 = item_ct1.get_group(0);
125
+ int64_t i2 = item_ct1.get_group(1);
126
+ int64_t i1 = item_ct1.get_group(2);
138
127
 
139
- int64_t o[4] = {0, 0, 0, 0};
140
- o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
128
+ int64_t o[4] = { 0, 0, 0, 0 };
129
+ o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
141
130
 
142
- const float *x;
131
+ const float * x;
143
132
 
144
- for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
145
- i0 += item_ct1.get_local_range(2)) {
133
+ for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
146
134
  if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
147
- x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
148
- (i0)*nb00);
135
+ x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
149
136
  } else {
150
- x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
151
- (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
137
+ x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
138
+ (i0 - o[0]) * nb10);
152
139
  }
153
140
 
154
141
  float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
155
142
 
156
143
  *y = *x;
157
- }
158
- });
144
+ }
145
+ });
159
146
  }
160
147
 
161
148
  void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -59,16 +59,10 @@ static void conv_transpose_1d_f32_f32_sycl(
59
59
  const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
60
60
  const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
61
61
  const sycl::range<3> block_nums(1, 1, num_blocks);
62
- stream->parallel_for(
63
- sycl::nd_range<3>(
64
- block_nums * block_dims, block_dims),
65
- [=](sycl::nd_item<3> item_ct1) {
66
- conv_transpose_1d_kernel(
67
- s0, output_size,
68
- src0_ne0, src0_ne1, src0_ne2,
69
- src1_ne0, dst_ne0,
70
- src0, src1, dst, item_ct1);
71
- });
62
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
63
+ conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst,
64
+ item_ct1);
65
+ });
72
66
  }
73
67
 
74
68
  void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
33
33
  {
34
34
  dpct::has_capability_or_fail(stream->get_device(),
35
35
  {sycl::aspect::fp16});
36
- stream->parallel_for(
37
- sycl::nd_range<3>(
38
- sycl::range<3>(1, 1, num_blocks) *
39
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
40
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
41
- [=](sycl::nd_item<3> item_ct1) {
42
- dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
43
- });
36
+ sycl_parallel_for(
37
+ stream,
38
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
39
+ sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
40
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
44
41
  }
45
42
  }
46
43
 
@@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
53
50
  dpct::has_capability_or_fail(stream->get_device(),
54
51
  {sycl::aspect::fp16});
55
52
 
56
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
57
- sycl::range<3>(1, 1, 64),
58
- sycl::range<3>(1, 1, 64)),
59
- [=](sycl::nd_item<3> item_ct1) {
60
- dequantize_block_q2_K(vx, y, item_ct1);
61
- });
53
+ sycl_parallel_for(
54
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
55
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
62
56
  }
63
57
  #else
64
58
  {
65
59
  dpct::has_capability_or_fail(stream->get_device(),
66
60
  {sycl::aspect::fp16});
67
61
 
68
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
69
- sycl::range<3>(1, 1, 32),
70
- sycl::range<3>(1, 1, 32)),
71
- [=](sycl::nd_item<3> item_ct1) {
72
- dequantize_block_q2_K(vx, y, item_ct1);
73
- });
62
+ sycl_parallel_for(
63
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
64
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
74
65
  }
75
66
 
76
67
  #endif
@@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
85
76
  dpct::has_capability_or_fail(stream->get_device(),
86
77
  {sycl::aspect::fp16});
87
78
 
88
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
89
- sycl::range<3>(1, 1, 64),
90
- sycl::range<3>(1, 1, 64)),
91
- [=](sycl::nd_item<3> item_ct1) {
92
- dequantize_block_q3_K(vx, y, item_ct1);
93
- });
79
+ sycl_parallel_for(
80
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
81
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
94
82
  }
95
83
  #else
96
84
  {
97
85
  dpct::has_capability_or_fail(stream->get_device(),
98
86
  {sycl::aspect::fp16});
99
87
 
100
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
101
- sycl::range<3>(1, 1, 32),
102
- sycl::range<3>(1, 1, 32)),
103
- [=](sycl::nd_item<3> item_ct1) {
104
- dequantize_block_q3_K(vx, y, item_ct1);
105
- });
88
+ sycl_parallel_for(
89
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
90
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
106
91
  }
107
92
  #endif
108
93
  }
@@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
116
101
  dpct::has_capability_or_fail(stream->get_device(),
117
102
  {sycl::aspect::fp16});
118
103
 
119
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
120
- sycl::range<3>(1, 1, 32),
121
- sycl::range<3>(1, 1, 32)),
122
- [=](sycl::nd_item<3> item_ct1) {
123
- dequantize_block_q4_0(vx, y, nb32, item_ct1);
124
- });
104
+ sycl_parallel_for(
105
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
106
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
125
107
  }
126
108
  }
127
109
 
@@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
135
117
  int constexpr WARP_K = WARP_SIZE * QK4_0;
136
118
  const int n_warp = (k + WARP_K - 1) / WARP_K;
137
119
  GGML_ASSERT(k % 2 == 0);
138
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
139
- sycl::range<3>(1, 1, WARP_SIZE),
140
- sycl::range<3>(1, 1, WARP_SIZE)),
141
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
142
- dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
143
- });
144
-
120
+ sycl_parallel_for(stream,
121
+ sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
122
+ sycl::range<3>(1, 1, WARP_SIZE)),
123
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
124
+ dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
125
+ });
145
126
  }
146
127
 
147
128
  template <typename dst_t>
@@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
153
134
  dpct::has_capability_or_fail(stream->get_device(),
154
135
  {sycl::aspect::fp16});
155
136
 
156
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
157
- sycl::range<3>(1, 1, 32),
158
- sycl::range<3>(1, 1, 32)),
159
- [=](sycl::nd_item<3> item_ct1) {
160
- dequantize_block_q4_1(vx, y, nb32, item_ct1);
161
- });
137
+ sycl_parallel_for(
138
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
139
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
162
140
  }
163
141
  }
164
142
 
@@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
171
149
  dpct::has_capability_or_fail(stream->get_device(),
172
150
  {sycl::aspect::fp16});
173
151
 
174
- stream->submit([&](sycl::handler &cgh) {
152
+ sycl_launch(stream, [&](sycl::handler & cgh) {
175
153
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
176
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
177
- sycl::range<3>(1, 1, 32),
178
- sycl::range<3>(1, 1, 32)),
179
- [=](sycl::nd_item<3> item_ct1) {
180
- dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
181
- });
154
+ sycl_parallel_for(
155
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
156
+ [=](sycl::nd_item<3> item_ct1) {
157
+ dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
158
+ });
182
159
  });
183
160
  }
184
161
  }
@@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i
191
168
 
192
169
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
193
170
 
194
- stream->submit([&](sycl::handler & cgh) {
171
+ sycl_launch(stream, [&](sycl::handler & cgh) {
195
172
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
196
173
 
197
- cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
198
- [=](sycl::nd_item<1> item_ct1) {
199
- dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
200
- });
174
+ sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
175
+ [=](sycl::nd_item<1> item_ct1) {
176
+ dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
177
+ });
201
178
  });
202
179
  }
203
180
 
@@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
210
187
  dpct::has_capability_or_fail(stream->get_device(),
211
188
  {sycl::aspect::fp16});
212
189
 
213
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
214
- sycl::range<3>(1, 1, 64),
215
- sycl::range<3>(1, 1, 64)),
216
- [=](sycl::nd_item<3> item_ct1) {
217
- dequantize_block_q5_K(vx, y, item_ct1);
218
- });
190
+ sycl_parallel_for(
191
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
192
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
219
193
  }
220
194
  #else
221
195
  {
222
196
  dpct::has_capability_or_fail(stream->get_device(),
223
197
  {sycl::aspect::fp16});
224
198
 
225
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
226
- sycl::range<3>(1, 1, 32),
227
- sycl::range<3>(1, 1, 32)),
228
- [=](sycl::nd_item<3> item_ct1) {
229
- dequantize_block_q5_K(vx, y, item_ct1);
230
- });
199
+ sycl_parallel_for(
200
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
201
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
231
202
  }
232
203
 
233
204
  #endif
@@ -242,24 +213,18 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
242
213
  dpct::has_capability_or_fail(stream->get_device(),
243
214
  {sycl::aspect::fp16});
244
215
 
245
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
246
- sycl::range<3>(1, 1, 64),
247
- sycl::range<3>(1, 1, 64)),
248
- [=](sycl::nd_item<3> item_ct1) {
249
- dequantize_block_q6_K(vx, y, item_ct1);
250
- });
216
+ sycl_parallel_for(
217
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
218
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
251
219
  }
252
220
  #else
253
221
  {
254
222
  dpct::has_capability_or_fail(stream->get_device(),
255
223
  {sycl::aspect::fp16});
256
224
 
257
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
258
- sycl::range<3>(1, 1, 32),
259
- sycl::range<3>(1, 1, 32)),
260
- [=](sycl::nd_item<3> item_ct1) {
261
- dequantize_block_q6_K(vx, y, item_ct1);
262
- });
225
+ sycl_parallel_for(
226
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
227
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
263
228
  }
264
229
 
265
230
  #endif
@@ -271,9 +236,9 @@ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const i
271
236
 
272
237
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
273
238
 
274
- stream->parallel_for(
275
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
276
- [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
239
+ sycl_parallel_for(stream,
240
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
241
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
277
242
  }
278
243
 
279
244
  template <typename dst_t>
@@ -284,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
284
249
  dpct::has_capability_or_fail(stream->get_device(),
285
250
  {sycl::aspect::fp16});
286
251
 
287
- stream->submit([&](sycl::handler &cgh) {
288
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
289
- sycl::range<3>(1, 1, 32),
290
- sycl::range<3>(1, 1, 32)),
291
- [=](sycl::nd_item<3> item_ct1) {
292
- dequantize_block_iq1_s(
293
- vx, y, item_ct1, iq1s_grid_gpu
294
- );
295
- });
252
+ sycl_launch(stream, [&](sycl::handler & cgh) {
253
+ sycl_parallel_for(
254
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
255
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
296
256
  });
297
257
  }
298
258
  }
@@ -305,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
305
265
  dpct::has_capability_or_fail(stream->get_device(),
306
266
  {sycl::aspect::fp16});
307
267
 
308
- stream->submit([&](sycl::handler &cgh) {
309
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
310
- sycl::range<3>(1, 1, 32),
311
- sycl::range<3>(1, 1, 32)),
312
- [=](sycl::nd_item<3> item_ct1) {
313
- dequantize_block_iq1_m(
314
- vx, y, item_ct1, iq1s_grid_gpu
315
- );
316
- });
268
+ sycl_launch(stream, [&](sycl::handler & cgh) {
269
+ sycl_parallel_for(
270
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
271
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
317
272
  });
318
273
  }
319
274
  }
@@ -326,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
326
281
  dpct::has_capability_or_fail(stream->get_device(),
327
282
  {sycl::aspect::fp16});
328
283
 
329
- stream->submit([&](sycl::handler &cgh) {
330
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
331
- sycl::range<3>(1, 1, 32),
332
- sycl::range<3>(1, 1, 32)),
333
- [=](sycl::nd_item<3> item_ct1) {
334
- dequantize_block_iq2_xxs(
335
- vx, y, item_ct1, iq2xxs_grid,
336
- ksigns_iq2xs, kmask_iq2xs);
337
- });
284
+ sycl_launch(stream, [&](sycl::handler & cgh) {
285
+ sycl_parallel_for(
286
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
287
+ [=](sycl::nd_item<3> item_ct1) {
288
+ dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
289
+ });
338
290
  });
339
291
  }
340
292
  }
@@ -347,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
347
299
  dpct::has_capability_or_fail(stream->get_device(),
348
300
  {sycl::aspect::fp16});
349
301
 
350
- stream->submit([&](sycl::handler &cgh) {
351
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
352
- sycl::range<3>(1, 1, 32),
353
- sycl::range<3>(1, 1, 32)),
354
- [=](sycl::nd_item<3> item_ct1) {
355
- dequantize_block_iq2_xs(
356
- vx, y, item_ct1, iq2xs_grid,
357
- ksigns_iq2xs, kmask_iq2xs);
358
- });
302
+ sycl_launch(stream, [&](sycl::handler & cgh) {
303
+ sycl_parallel_for(
304
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
305
+ [=](sycl::nd_item<3> item_ct1) {
306
+ dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
307
+ });
359
308
  });
360
309
  }
361
310
  }
@@ -368,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
368
317
  dpct::has_capability_or_fail(stream->get_device(),
369
318
  {sycl::aspect::fp16});
370
319
 
371
- stream->submit([&](sycl::handler &cgh) {
372
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
373
- sycl::range<3>(1, 1, 32),
374
- sycl::range<3>(1, 1, 32)),
375
- [=](sycl::nd_item<3> item_ct1) {
376
- dequantize_block_iq2_s(vx, y, item_ct1);
377
- });
320
+ sycl_launch(stream, [&](sycl::handler & cgh) {
321
+ sycl_parallel_for(
322
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
323
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
378
324
  });
379
325
  }
380
326
  }
@@ -388,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
388
334
  dpct::has_capability_or_fail(stream->get_device(),
389
335
  {sycl::aspect::fp16});
390
336
 
391
- stream->submit([&](sycl::handler &cgh) {
392
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
393
- sycl::range<3>(1, 1, 32),
394
- sycl::range<3>(1, 1, 32)),
395
- [=](sycl::nd_item<3> item_ct1) {
396
- dequantize_block_iq3_xxs(
397
- vx, y, item_ct1, iq3xxs_grid,
398
- ksigns_iq2xs, kmask_iq2xs);
399
- });
337
+ sycl_launch(stream, [&](sycl::handler & cgh) {
338
+ sycl_parallel_for(
339
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
340
+ [=](sycl::nd_item<3> item_ct1) {
341
+ dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
342
+ });
400
343
  });
401
344
  }
402
345
  }
@@ -409,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
409
352
  dpct::has_capability_or_fail(stream->get_device(),
410
353
  {sycl::aspect::fp16});
411
354
 
412
- stream->submit([&](sycl::handler &cgh) {
413
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
414
- sycl::range<3>(1, 1, 32),
415
- sycl::range<3>(1, 1, 32)),
416
- [=](sycl::nd_item<3> item_ct1) {
417
- dequantize_block_iq3_s(
418
- vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
419
- });
355
+ sycl_launch(stream, [&](sycl::handler & cgh) {
356
+ sycl_parallel_for(
357
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
358
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
420
359
  });
421
360
  }
422
361
  }
@@ -432,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
432
371
  dpct::has_capability_or_fail(stream->get_device(),
433
372
  {sycl::aspect::fp16});
434
373
 
435
- stream->submit([&](sycl::handler &cgh) {
436
- cgh.parallel_for(
437
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
438
- sycl::range<3>(1, 1, 32),
439
- sycl::range<3>(1, 1, 32)),
440
- [=](sycl::nd_item<3> item_ct1) {
441
- dequantize_block_iq4_xs(vx, y, item_ct1);
442
- });
374
+ sycl_launch(stream, [&](sycl::handler & cgh) {
375
+ sycl_parallel_for(
376
+ cgh,
377
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
378
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
443
379
  });
444
380
  }
445
381
  #endif
@@ -453,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
453
389
  dpct::has_capability_or_fail(stream->get_device(),
454
390
  {sycl::aspect::fp16});
455
391
 
456
- stream->submit([&](sycl::handler &cgh) {
457
- cgh.parallel_for(
458
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
459
- sycl::range<3>(1, 1, 32),
460
- sycl::range<3>(1, 1, 32)),
461
- [=](sycl::nd_item<3> item_ct1) {
462
- dequantize_block_iq4_nl(vx, y, item_ct1);
463
- });
392
+ sycl_launch(stream, [&](sycl::handler & cgh) {
393
+ sycl_parallel_for(
394
+ cgh,
395
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
396
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
464
397
  });
465
398
  }
466
399
  }