whispercpp 1.3.2 → 1.3.3

This diff shows the contents of publicly available package versions as released to their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
  return { type, major, minor, patch };
  }

+ // Profiling
+ struct ProfilingInfo {
+ std::string op_name;
+ std::string kernel_name;
+
+ cl_kernel kernel;
+ cl_event evt;
+
+ cl_ulong cmd_queued;
+ cl_ulong cmd_submit;
+ cl_ulong cmd_start;
+ cl_ulong cmd_end;
+ cl_ulong overhead_start;
+ cl_ulong overhead_end;
+ // For the times below, see spec for clGetEventProfilingInfo
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
+ cl_ulong cmd_queued_duration_ns;
+ // The time kernel spent for submission - START - SUBMIT
+ cl_ulong cmd_submit_duration_ns;
+ // Kernel execution time in nanoseconds - END - START
+ cl_ulong cmd_duration_ns;
+ // The time for the kernel to complete - COMPLETE - END
+ cl_ulong cmd_complete_duration_ns;
+ // Total time to finish the kernel - COMPELTE - QUEUED
+ cl_ulong cmd_total_duration_ns;
+ // Global and local work sizes.
+ size_t global_size[3];
+ size_t local_size[3];
+ // Op output size.
+ size_t output_size[4];
+ };
+
+ static void populateProfilingInfo(
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
+ size_t global_size[3], size_t local_size[3],
+ const ggml_tensor * tensor) {
+ info.op_name = tensor->name;
+ info.kernel = kernel;
+ info.evt = evt;
+
+ // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
+ info.local_size[0] = 0;
+ info.local_size[1] = 0;
+ info.local_size[2] = 0;
+
+ info.global_size[0] = 0;
+ info.global_size[1] = 0;
+ info.global_size[2] = 0;
+
+ if (local_size) {
+ for (cl_uint i = 0; i < work_dim; ++i) {
+ info.local_size[i] = local_size[i];
+ }
+ }
+
+ for (cl_uint i = 0; i < work_dim; ++i) {
+ info.global_size[i] = global_size[i];
+ }
+
+ info.output_size[0] = tensor->ne[0];
+ info.output_size[1] = tensor->ne[1];
+ info.output_size[2] = tensor->ne[2];
+ info.output_size[3] = tensor->ne[3];
+ }
+
  struct ggml_backend_opencl_context;

  // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {

  // backend context
  struct ggml_backend_opencl_context {
+ int ref_count;
+
  cl_device_id device;
  std::string device_name;

@@ -284,6 +351,7 @@ struct ggml_backend_opencl_context {
  cl_program program_gemv_noshuffle_general;
  cl_program program_gemv_noshuffle;
  cl_program program_get_rows;
+ cl_program program_glu;
  cl_program program_im2col_f16;
  cl_program program_im2col_f32;
  cl_program program_mul_mat_Ab_Bi_8x4;
@@ -299,27 +367,46 @@ struct ggml_backend_opencl_context {
  cl_program program_mul_mv_f16_f32;
  cl_program program_mul_mv_f32_f32;
  cl_program program_mul;
+ cl_program program_div;
+ cl_program program_sub;
  cl_program program_norm;
  cl_program program_relu;
  cl_program program_rms_norm;
+ cl_program program_group_norm;
  cl_program program_rope;
  cl_program program_scale;
  cl_program program_silu;
+ cl_program program_sigmoid;
  cl_program program_softmax_f32;
  cl_program program_softmax_f16;
  cl_program program_softmax_4_f32;
  cl_program program_softmax_4_f16;
+ cl_program program_argsort_f32_i32;
+ cl_program program_sum_rows_f32;
+ cl_program program_repeat;
+ cl_program program_pad;
+ cl_program program_tanh;
+ cl_program program_upscale;
+ cl_program program_concat;
+ cl_program program_tsembd;
+ cl_program program_mul_mv_id_q4_0_f32_8x_flat;

  cl_kernel kernel_add, kernel_add_row;
  cl_kernel kernel_mul, kernel_mul_row;
+ cl_kernel kernel_div, kernel_div_row;
+ cl_kernel kernel_sub, kernel_sub_row;
  cl_kernel kernel_scale;
  cl_kernel kernel_silu, kernel_silu_4;
  cl_kernel kernel_gelu, kernel_gelu_4;
  cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
  cl_kernel kernel_relu;
+ cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
  cl_kernel kernel_clamp;
+ cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu,
+ kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16;
  cl_kernel kernel_norm;
  cl_kernel kernel_rms_norm;
+ cl_kernel kernel_group_norm;
  cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
  cl_kernel kernel_soft_max, kernel_soft_max_4;
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -339,6 +426,120 @@ struct ggml_backend_opencl_context {
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
  cl_kernel kernel_mul_mv_q6_K_f32;
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
+ cl_kernel kernel_argsort_f32_i32;
+ cl_kernel kernel_sum_rows_f32;
+ cl_kernel kernel_repeat;
+ cl_kernel kernel_pad;
+ cl_kernel kernel_tanh_f32_nd;
+ cl_kernel kernel_tanh_f16_nd;
+ cl_kernel kernel_upscale;
+ cl_kernel kernel_upscale_bilinear;
+ cl_kernel kernel_concat_f32_contiguous;
+ cl_kernel kernel_concat_f32_non_contiguous;
+ cl_kernel kernel_timestep_embedding;
+ cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+ std::vector<ProfilingInfo> profiling_info;
+
+ void write_profiling_info() {
+ FILE * fperf = fopen("cl_profiling.csv", "w");
+ if (!fperf) {
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+ return;
+ }
+
+ // Populate profiling info
+ for (ProfilingInfo & info : profiling_info) {
+ cl_ulong cmd_queued;
+ cl_ulong cmd_submit;
+ cl_ulong cmd_start;
+ cl_ulong cmd_end;
+ cl_ulong cmd_complete;
+
+ CL_CHECK(clWaitForEvents(1, &info.evt));
+ CL_CHECK(clGetEventProfilingInfo(
+ info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+ CL_CHECK(clGetEventProfilingInfo(
+ info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+ CL_CHECK(clGetEventProfilingInfo(
+ info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+ CL_CHECK(clGetEventProfilingInfo(
+ info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+ CL_CHECK(clGetEventProfilingInfo(
+ info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+ CL_CHECK(clReleaseEvent(info.evt));
+
+ char kernel_name[512];
+ CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+ sizeof(kernel_name), kernel_name, NULL));
+ info.kernel_name = kernel_name;
+
+ info.cmd_queued = cmd_queued;
+ info.cmd_submit = cmd_submit;
+ info.cmd_start = cmd_start;
+ info.cmd_end = cmd_end;
+
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
+ info.cmd_duration_ns = cmd_end - cmd_start;
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
+ }
+
+ // Dump a csv
+ float total_kernel_time = 0;
+ fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+ for (const ProfilingInfo & info : profiling_info) {
+ total_kernel_time += info.cmd_duration_ns/1.e6f;
+ fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+ info.op_name.c_str(), info.kernel_name.c_str(),
+ info.cmd_queued_duration_ns/1.e6f,
+ info.cmd_submit_duration_ns/1.e6f,
+ info.cmd_duration_ns/1.e6f,
+ info.cmd_complete_duration_ns/1.e6f,
+ info.cmd_total_duration_ns/1.e6f,
+ info.global_size[0], info.global_size[1], info.global_size[2],
+ info.local_size[0], info.local_size[1], info.local_size[2],
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+ }
+ fclose(fperf);
+
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+ // Dump a simple chrome trace
+ FILE* ftrace = fopen("cl_trace.json", "w");
+ if (!ftrace) {
+ GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+ return;
+ }
+
+ fprintf(ftrace, "[\n");
+ for (const ProfilingInfo & info : profiling_info) {
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+ info.kernel_name.c_str(), info.cmd_queued/1000);
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+ info.kernel_name.c_str(), info.cmd_submit/1000);
+
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+ info.kernel_name.c_str(), info.cmd_start/1000);
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+ info.kernel_name.c_str(), info.cmd_end/1000);
+ }
+ fclose(ftrace);
+ }
+
+ void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ profiling_info.emplace_back();
+ populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+ #else
+ GGML_UNUSED(tensor);
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ }

  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
  // Transpose kernels
@@ -366,46 +567,19 @@ struct ggml_backend_opencl_context {
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
- };
-
- // All registered devices with a default device in the front.
- static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;

- // Profiling
+ void free() {
+ ref_count--;
+ if (ref_count == 0) {
  #ifdef GGML_OPENCL_PROFILING
- struct ProfilingInfo {
- std::string op_name;
- std::string kernel_name;
-
- cl_kernel kernel;
- cl_event evt;
-
- cl_ulong cmd_queued;
- cl_ulong cmd_submit;
- cl_ulong cmd_start;
- cl_ulong cmd_end;
- cl_ulong overhead_start;
- cl_ulong overhead_end;
- // For the times below, see spec for clGetEventProfilingInfo
- // The time kernel spent in cmd queue - SUBMIT - QUEUED
- cl_ulong cmd_queued_duration_ns;
- // The time kernel spent for submission - START - SUBMIT
- cl_ulong cmd_submit_duration_ns;
- // Kernel execution time in nanoseconds - END - START
- cl_ulong cmd_duration_ns;
- // The time for the kernel to complete - COMPLETE - END
- cl_ulong cmd_complete_duration_ns;
- // Total time to finish the kernel - COMPELTE - QUEUED
- cl_ulong cmd_total_duration_ns;
- // Global and local work sizes.
- size_t global_size[3];
- size_t local_size[3];
- // Op output size.
- size_t output_size[4];
+ write_profiling_info();
+ #endif
+ }
+ }
  };

- std::vector<ProfilingInfo> g_profiling_info;
- #endif
+ // All registered devices with a default device in the front.
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;

  inline std::string read_file(const std::string &path) {
  std::ifstream ifs(path);
@@ -567,6 +741,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
  GGML_LOG_CONT(".");
  }

+ // glu
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "glu.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("glu.cl");
+ #endif
+ backend_ctx->program_glu =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
+ CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
+ CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
+ CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
  // get_rows
  {
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -986,152 +1181,411 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
  GGML_LOG_CONT(".");
  }

- // Adreno kernels
- #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
- // transpose
+ // argsort
  {
  #ifdef GGML_OPENCL_EMBED_KERNELS
  const std::string kernel_src {
- #include "transpose.cl.h"
+ #include "argsort.cl.h"
  };
  #else
- const std::string kernel_src = read_file("transpose.cl");
+ const std::string kernel_src = read_file("argsort.cl");
  #endif
- backend_ctx->program_transpose =
+ backend_ctx->program_argsort_f32_i32 =
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
- CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
- CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+ CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
  GGML_LOG_CONT(".");
  }

- // gemv_noshuffle_general
+ // div
  {
- std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
- " -cl-mad-enable "
- " -DSIMDGROUP_WIDTH=" +
- std::to_string(backend_ctx->adreno_wave_size);
- if (backend_ctx->has_vector_subgroup_broadcast) {
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
- }
-
  #ifdef GGML_OPENCL_EMBED_KERNELS
- const std::string kernel_src_CL_gemv_general {
- #include "gemv_noshuffle_general.cl.h"
+ const std::string kernel_src {
+ #include "div.cl.h"
  };
  #else
- const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
+ const std::string kernel_src = read_file("div.cl");
  #endif
+ backend_ctx->program_div =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- backend_ctx->program_CL_gemv_general = build_program_from_source(
- backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
-
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
  GGML_LOG_CONT(".");
  }

- // gemv_noshuffle
+ // sub
  {
- // Gemv 2048, 16384
- std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
- " -cl-mad-enable "
- " -DLINE_STRIDE_A=2048 "
- " -DBLOCK_STRIDE_A=16384 "
- " -DSIMDGROUP_WIDTH=" +
- std::to_string(backend_ctx->adreno_wave_size);
- if (backend_ctx->has_vector_subgroup_broadcast) {
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
- }
-
  #ifdef GGML_OPENCL_EMBED_KERNELS
- const std::string kernel_src_CL_gemv {
- #include "gemv_noshuffle.cl.h"
+ const std::string kernel_src {
+ #include "sub.cl.h"
  };
  #else
- const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
+ const std::string kernel_src = read_file("sub.cl");
  #endif
+ backend_ctx->program_sub =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
- backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
  GGML_LOG_CONT(".");
+ }

- // Gemv 2048, 16384
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
- " -cl-mad-enable "
- " -DLINE_STRIDE_A=2048 "
- " -DBLOCK_STRIDE_A=16384 "
- " -DSIMDGROUP_WIDTH=" +
- std::to_string(backend_ctx->adreno_wave_size);
- if (backend_ctx->has_vector_subgroup_broadcast) {
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
- }
+ // sum_rows
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "sum_rows.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("sum_rows.cl");
+ #endif
+ backend_ctx->program_sum_rows_f32 =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
- backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
  GGML_LOG_CONT(".");
+ }

- // Gemv 5504, 44032
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
- " -cl-mad-enable "
- " -DLINE_STRIDE_A=5504 "
- " -DBLOCK_STRIDE_A=44032 "
- " -DSIMDGROUP_WIDTH=" +
- std::to_string(backend_ctx->adreno_wave_size);
- if (backend_ctx->has_vector_subgroup_broadcast) {
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
- }
+ // sigmoid
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "sigmoid.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("sigmoid.cl");
+ #endif
+ backend_ctx->program_sigmoid =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
- backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
+ CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
  GGML_LOG_CONT(".");
+ }

- // Gemv 16000, 128000
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
- " -cl-mad-enable "
- " -DLINE_STRIDE_A=16000 "
- " -DBLOCK_STRIDE_A=128000 "
- " -DSIMDGROUP_WIDTH=" +
- std::to_string(backend_ctx->adreno_wave_size);
-
- if (backend_ctx->has_vector_subgroup_broadcast) {
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
- }
+ // group_norm
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "group_norm.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("group_norm.cl");
+ #endif
+ backend_ctx->program_group_norm =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
- backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
+ CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
  GGML_LOG_CONT(".");
  }

- // mul_mat_Ab_Bi_8x4
+ // repeat
  {
  #ifdef GGML_OPENCL_EMBED_KERNELS
- const std::string kernel_src_CL_gemm {
- #include "mul_mat_Ab_Bi_8x4.cl.h"
+ const std::string kernel_src {
+ #include "repeat.cl.h"
  };
  #else
- const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
+ const std::string kernel_src = read_file("repeat.cl");
  #endif
- backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
- CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
- GGML_LOG_CONT(".");
+ if (!kernel_src.empty()) {
+ backend_ctx->program_repeat =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
+ backend_ctx->program_repeat = nullptr;
+ backend_ctx->kernel_repeat = nullptr;
+ }
  }
- #endif // GGML_OPENCL_USE_ADRENO_KERNELS
- GGML_LOG_CONT("\n");
- }

- // XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
- // XXX static bool initialized = false;
- // XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
-
- static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
-
- namespace /* anonymous */ {
- extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
+ // pad
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "pad.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("pad.cl");
+ #endif
+ if (!kernel_src.empty()) {
+ backend_ctx->program_pad =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
+ backend_ctx->program_pad = nullptr;
+ backend_ctx->kernel_pad = nullptr;
+ }
+ }
+
+ // tanh
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "tanh.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("tanh.cl");
+ #endif
+ if (!kernel_src.empty()) {
+ backend_ctx->program_tanh =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
+ CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
+ backend_ctx->program_tanh = nullptr;
+ backend_ctx->kernel_tanh_f32_nd = nullptr;
+ backend_ctx->kernel_tanh_f16_nd = nullptr;
+ }
+ }
+
+ // upscale
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "upscale.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("upscale.cl");
+ #endif
+ if (!kernel_src.empty()) {
+ backend_ctx->program_upscale =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
+ if (backend_ctx->program_upscale) {
+ cl_int err_bilinear;
+ backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
+ if (err_bilinear != CL_SUCCESS) {
+ GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
+ backend_ctx->kernel_upscale_bilinear = nullptr;
+ }
+ } else {
+ backend_ctx->kernel_upscale_bilinear = nullptr;
+ }
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
+ backend_ctx->program_upscale = nullptr;
+ backend_ctx->kernel_upscale = nullptr;
+ backend_ctx->kernel_upscale_bilinear = nullptr;
+ }
+ }
+
+ // concat
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "concat.cl.h"
+ };
+ #else
+
+ const std::string kernel_src = read_file("concat.cl");
+ #endif
+ if (!kernel_src.empty()) {
+ backend_ctx->program_concat =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
+ CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
+ backend_ctx->program_concat = nullptr;
+ backend_ctx->kernel_concat_f32_contiguous = nullptr;
+ backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
+ }
+ }
+
+ // timestep_embedding
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "tsembd.cl.h"
+ };
+ #else
+
+ const std::string kernel_src = read_file("tsembd.cl");
+ #endif
+ if (!kernel_src.empty()) {
+ backend_ctx->program_tsembd =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
+ GGML_LOG_CONT(".");
+ } else {
+ GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
+ backend_ctx->program_tsembd = nullptr;
+ backend_ctx->kernel_timestep_embedding = nullptr;
+ }
+ }
+
+ // mul_mv_id_q4_0_f32_8x_flat
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
+ #endif
+ backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
+ // Adreno kernels
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ // transpose
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "transpose.cl.h"
+ };
+ #else
+ const std::string kernel_src = read_file("transpose.cl");
+ #endif
+ backend_ctx->program_transpose =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
+ CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
+ CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
+ // gemv_noshuffle_general
+ {
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -DSIMDGROUP_WIDTH=" +
+ std::to_string(backend_ctx->adreno_wave_size);
+ if (backend_ctx->has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemv_general {
+ #include "gemv_noshuffle_general.cl.h"
+ };
+ #else
+ const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
+ #endif
+
+ backend_ctx->program_CL_gemv_general = build_program_from_source(
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
+ // gemv_noshuffle
+ {
+ // Gemv 2048, 16384
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=2048 "
+ " -DBLOCK_STRIDE_A=16384 "
+ " -DSIMDGROUP_WIDTH=" +
+ std::to_string(backend_ctx->adreno_wave_size);
+ if (backend_ctx->has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemv {
+ #include "gemv_noshuffle.cl.h"
+ };
+ #else
+ const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
+ #endif
+
+ backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
+ GGML_LOG_CONT(".");
+
+ // Gemv 2048, 16384
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=2048 "
+ " -DBLOCK_STRIDE_A=16384 "
+ " -DSIMDGROUP_WIDTH=" +
+ std::to_string(backend_ctx->adreno_wave_size);
+ if (backend_ctx->has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
+ GGML_LOG_CONT(".");
+
+ // Gemv 5504, 44032
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=5504 "
+ " -DBLOCK_STRIDE_A=44032 "
+ " -DSIMDGROUP_WIDTH=" +
+ std::to_string(backend_ctx->adreno_wave_size);
+ if (backend_ctx->has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
+ GGML_LOG_CONT(".");
+
+ // Gemv 16000, 128000
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -DLINE_STRIDE_A=16000 "
+ " -DBLOCK_STRIDE_A=128000 "
+ " -DSIMDGROUP_WIDTH=" +
+ std::to_string(backend_ctx->adreno_wave_size);
+
+ if (backend_ctx->has_vector_subgroup_broadcast) {
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+ }
+
+ backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
+ // mul_mat_Ab_Bi_8x4
+ {
+ #ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src_CL_gemm {
+ #include "mul_mat_Ab_Bi_8x4.cl.h"
+ };
+ #else
+ const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
+ #endif
+ backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
+ CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
+ GGML_LOG_CONT(".");
+ }
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+ GGML_LOG_CONT("\n");
+ }
+
+ // XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+ // XXX static bool initialized = false;
+ // XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
+
+ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
+
+ namespace /* anonymous */ {
+ extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
  }

  // Look for available and suitable devices.
@@ -1381,6 +1835,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  backend_ctx->device = dev_ctx->device;
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;

+ // ref_count get increased in ggml_backend_opencl_device_init
+ // This function is also used to retrieve backend context, so we don't want
+ // to increase ref_count for each call. We only want to increase ref_count
+ // when the associated device is initialized
+ backend_ctx->ref_count = 0;
+
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1553,93 +2013,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1553
2013
  return dev_ctx->backend_ctx;
1554
2014
  }
1555
2015
 
1556
- static void ggml_cl2_free(void) {
1557
- #ifdef GGML_OPENCL_PROFILING
1558
- FILE * fperf = fopen("cl_profiling.csv", "w");
1559
- if (!fperf) {
1560
- GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
1561
- return;
1562
- }
2016
+ static void ggml_cl2_free(ggml_backend_t backend) {
2017
+ ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
2018
+ ctx->free();
1563
2019
 
1564
- // Populate profiling info
1565
- for (ProfilingInfo & info : g_profiling_info) {
1566
- cl_ulong cmd_queued;
1567
- cl_ulong cmd_submit;
1568
- cl_ulong cmd_start;
1569
- cl_ulong cmd_end;
1570
- cl_ulong cmd_complete;
1571
-
1572
- CL_CHECK(clWaitForEvents(1, &info.evt));
1573
- CL_CHECK(clGetEventProfilingInfo(
1574
- info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
1575
- CL_CHECK(clGetEventProfilingInfo(
1576
- info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
1577
- CL_CHECK(clGetEventProfilingInfo(
1578
- info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
1579
- CL_CHECK(clGetEventProfilingInfo(
1580
- info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
1581
- CL_CHECK(clGetEventProfilingInfo(
1582
- info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
1583
- CL_CHECK(clReleaseEvent(info.evt));
1584
-
1585
- char kernel_name[512];
1586
- CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
1587
- sizeof(kernel_name), kernel_name, NULL));
1588
- info.kernel_name = kernel_name;
1589
-
1590
- info.cmd_queued = cmd_queued;
1591
- info.cmd_submit = cmd_submit;
1592
- info.cmd_start = cmd_start;
1593
- info.cmd_end = cmd_end;
1594
-
1595
- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
1596
- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
1597
- info.cmd_duration_ns = cmd_end - cmd_start;
1598
- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
1599
- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
1600
- }
1601
-
1602
- // Dump a csv
1603
- float total_kernel_time = 0;
1604
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
1605
- for (const ProfilingInfo & info : g_profiling_info) {
1606
- total_kernel_time += info.cmd_duration_ns/1.e6f;
1607
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
1608
- info.op_name.c_str(), info.kernel_name.c_str(),
1609
- info.cmd_queued_duration_ns/1.e6f,
1610
- info.cmd_submit_duration_ns/1.e6f,
1611
- info.cmd_duration_ns/1.e6f,
1612
- info.cmd_complete_duration_ns/1.e6f,
1613
- info.cmd_total_duration_ns/1.e6f,
1614
- info.global_size[0], info.global_size[1], info.global_size[2],
1615
- info.local_size[0], info.local_size[1], info.local_size[2],
1616
- info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
1617
- }
1618
- fclose(fperf);
1619
-
1620
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
1621
-
1622
- // Dump a simple chrome trace
1623
- FILE* ftrace = fopen("cl_trace.json", "w");
1624
- if (!ftrace) {
1625
- GGML_LOG_ERROR("Failed to open cl_trace.json\n");
1626
- return;
2020
+ // The CL context is shared by all backends, release it if all backends have been released
2021
+ bool should_release_opencl = true;
2022
+ for (auto device : g_ggml_backend_opencl_devices) {
2023
+ ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2024
+ if (ctx_dev->backend_ctx->ref_count > 0) {
2025
+ should_release_opencl = false;
2026
+ }
1627
2027
  }
1628
2028
 
1629
- fprintf(ftrace, "[\n");
1630
- for (const ProfilingInfo & info : g_profiling_info) {
1631
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1632
- info.kernel_name.c_str(), info.cmd_queued/1000);
1633
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1634
- info.kernel_name.c_str(), info.cmd_submit/1000);
1635
-
1636
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1637
- info.kernel_name.c_str(), info.cmd_start/1000);
1638
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1639
- info.kernel_name.c_str(), info.cmd_end/1000);
2029
+ if (should_release_opencl) {
2030
+ CL_CHECK(clReleaseContext(ctx->context));
1640
2031
  }
1641
- fclose(ftrace);
1642
- #endif
1643
2032
  }
1644
2033
 
1645
2034
  //------------------------------------------------------------------------------
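Note on the hunk above: ggml_cl2_free now takes the backend being destroyed, calls ctx->free(), and releases the shared cl_context only once no device's backend context holds a positive ref_count (the matching ref_count++ appears later in ggml_backend_opencl_device_init). A minimal sketch of that acquire/release pattern, with a hypothetical shared_cl_state holder standing in for the real backend/device context types:

#include <CL/cl.h>

// Hypothetical holder for the process-wide OpenCL state; not the backend's real type.
struct shared_cl_state {
    cl_context context   = nullptr;
    int        ref_count = 0;   // one per live backend
};

// Mirrors the ref_count++ done when a new backend is initialized.
static void shared_cl_acquire(shared_cl_state & s) {
    s.ref_count++;
}

// Mirrors ggml_cl2_free: the shared context is released only when the last
// backend holding a reference goes away.
static void shared_cl_release(shared_cl_state & s) {
    if (--s.ref_count == 0 && s.context != nullptr) {
        clReleaseContext(s.context);
        s.context = nullptr;
    }
}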
@@ -1723,9 +2112,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
1723
2112
  }
1724
2113
 
1725
2114
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
1726
- ggml_cl2_free();
1727
-
1728
- GGML_UNUSED(backend);
2115
+ ggml_cl2_free(backend);
1729
2116
  }
1730
2117
 
1731
2118
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1752,7 +2139,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
1752
2139
  }
1753
2140
 
1754
2141
  static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
1755
- GGML_UNUSED(backend);
2142
+ auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
2143
+
2144
+ cl_event evt;
2145
+ CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
2146
+ CL_CHECK(clWaitForEvents(1, &evt));
2147
+ CL_CHECK(clReleaseEvent(evt));
1756
2148
  }
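The new ggml_backend_opencl_synchronize drains the queue by enqueueing a barrier that produces an event and then blocking on that event. A self-contained sketch of the same pattern follows; CHECK is a local stand-in for the file's CL_CHECK macro:

#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(x)                                                                              \
    do {                                                                                      \
        cl_int err_ = (x);                                                                    \
        if (err_ != CL_SUCCESS) {                                                             \
            std::fprintf(stderr, "OpenCL error %d at %s:%d\n", err_, __FILE__, __LINE__);     \
            std::exit(1);                                                                     \
        }                                                                                     \
    } while (0)

// Block the host until every command previously enqueued on `queue` has completed.
static void queue_synchronize(cl_command_queue queue) {
    cl_event evt;
    CHECK(clEnqueueBarrierWithWaitList(queue, 0, nullptr, &evt)); // barrier after all prior commands
    CHECK(clWaitForEvents(1, &evt));                              // host-side wait
    CHECK(clReleaseEvent(evt));
}

clFinish(queue) would block equivalently here; keeping an explicit event makes it straightforward to extend with event-wait lists later.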
1757
2149
 
1758
2150
  // Synchronizes the 'backend_ctx's device with others so that commands
@@ -1856,6 +2248,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1856
2248
  case GGML_OP_ADD:
1857
2249
  case GGML_OP_SCALE:
1858
2250
  case GGML_OP_MUL:
2251
+ case GGML_OP_DIV:
2252
+ case GGML_OP_SUB:
1859
2253
  return op->src[0]->type == GGML_TYPE_F32;
1860
2254
  case GGML_OP_UNARY:
1861
2255
  switch (ggml_get_unary_op(op)) {
@@ -1864,6 +2258,20 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1864
2258
  case GGML_UNARY_OP_RELU:
1865
2259
  case GGML_UNARY_OP_GELU_QUICK:
1866
2260
  return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
2261
+ case GGML_UNARY_OP_SIGMOID:
2262
+ return ggml_is_contiguous(op->src[0]);
2263
+ case GGML_UNARY_OP_TANH:
2264
+ return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
2265
+ (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
2266
+ default:
2267
+ return false;
2268
+ }
2269
+ case GGML_OP_GLU:
2270
+ switch (ggml_get_glu_op(op)) {
2271
+ case GGML_GLU_OP_GEGLU:
2272
+ case GGML_GLU_OP_REGLU:
2273
+ case GGML_GLU_OP_SWIGLU:
2274
+ return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
1867
2275
  default:
1868
2276
  return false;
1869
2277
  }
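The unary gate above now admits SIGMOID (any contiguous input), TANH (matching f32→f32 or f16→f16 types) and the GEGLU/REGLU/SWIGLU GLU ops (row-contiguous input, f32 or f16 output). A condensed sketch of that kind of type/contiguity gate, using stand-in enums rather than the real ggml types:

// Stand-in types; Type/Unary are illustrative, not the ggml enums.
enum class Type  { F32, F16 };
enum class Unary { Relu, GeluQuick, Sigmoid, Tanh };

// Condensed version of the unary-op gate: RELU/GELU_QUICK need contiguous f32 input,
// SIGMOID only needs contiguity, TANH needs matching f32->f32 or f16->f16 types.
static bool supports_unary(Unary u, Type src_type, Type dst_type, bool src_contiguous) {
    switch (u) {
        case Unary::Relu:
        case Unary::GeluQuick: return src_contiguous && src_type == Type::F32;
        case Unary::Sigmoid:   return src_contiguous;
        case Unary::Tanh:      return src_type == dst_type;  // f32/f32 or f16/f16
    }
    return false;
}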
@@ -1873,16 +2281,36 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1873
2281
  case GGML_OP_NORM:
1874
2282
  case GGML_OP_RMS_NORM:
1875
2283
  return true;
2284
+ case GGML_OP_REPEAT:
2285
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
2286
+ case GGML_OP_PAD:
2287
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
2288
+ op->src[0]->ne[3] == 1 && op->ne[3] == 1;
2289
+ case GGML_OP_UPSCALE:
2290
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2291
+ case GGML_OP_CONCAT:
2292
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2293
+ case GGML_OP_TIMESTEP_EMBEDDING:
2294
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2295
+ case GGML_OP_GROUP_NORM:
2296
+ return ggml_is_contiguous(op->src[0]);
1876
2297
  case GGML_OP_MUL_MAT:
1877
2298
  if (op->src[0]->type == GGML_TYPE_F16) {
1878
2299
  return true;
1879
2300
  } else if (op->src[0]->type == GGML_TYPE_F32) {
1880
- return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2301
+ return op->src[1]->type == GGML_TYPE_F32;
1881
2302
  } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
1882
2303
  op->src[0]->type == GGML_TYPE_Q6_K) {
1883
2304
  return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
1884
2305
  }
1885
2306
  return false;
2307
+ case GGML_OP_MUL_MAT_ID:
2308
+ if (op->src[0]->type == GGML_TYPE_Q4_0) {
2309
+ if (op->src[1]->type == GGML_TYPE_F32) {
2310
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2311
+ }
2312
+ }
2313
+ return false;
1886
2314
  case GGML_OP_RESHAPE:
1887
2315
  case GGML_OP_VIEW:
1888
2316
  case GGML_OP_PERMUTE:
@@ -1912,6 +2340,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1912
2340
  }
1913
2341
  case GGML_OP_IM2COL:
1914
2342
  return true;
2343
+ case GGML_OP_ARGSORT:
2344
+ return op->src[0]->type == GGML_TYPE_F32;
2345
+ case GGML_OP_SUM_ROWS:
2346
+ return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
1915
2347
  default:
1916
2348
  return false;
1917
2349
  }
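Several of these gates hinge on ggml_is_contiguous: the quantized MUL_MAT paths, the new MUL_MAT_ID (Q4_0 × F32) path and SUM_ROWS still require it, while the F32 × F32 MUL_MAT path drops the requirement in this release. For reference, a contiguity test over a 4-D shape can be sketched as below (ne/nb mirror ggml's element counts and byte strides; this ignores quantized block layouts and is not ggml's own helper):

#include <cstddef>
#include <cstdint>

// A tensor is contiguous when each byte stride equals the product of the
// preceding dimensions times the element size (no padding, no permutation).
static bool is_contiguous_4d(const int64_t ne[4], const size_t nb[4], size_t type_size) {
    size_t expected = type_size;
    for (int i = 0; i < 4; ++i) {
        if (nb[i] != expected) {
            return false;
        }
        expected *= (size_t) ne[i];
    }
    return true;
}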
@@ -1931,7 +2363,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
1931
2363
  /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
1932
2364
  /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
1933
2365
  /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
1934
- /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */
2366
+ /* .synchronize = */ ggml_backend_opencl_synchronize,
1935
2367
  /* .graph_plan_create = */ NULL,
1936
2368
  /* .graph_plan_free = */ NULL,
1937
2369
  /* .graph_plan_update = */ NULL,
@@ -2575,6 +3007,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
2575
3007
 
2576
3008
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2577
3009
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
3010
+ // Getting a new reference to the backend, increase ref_count
3011
+ backend_ctx->ref_count++;
2578
3012
 
2579
3013
  ggml_backend_t backend = new ggml_backend {
2580
3014
  /* .guid = */ ggml_backend_opencl_guid(),
@@ -2835,31 +3269,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
2835
3269
  #define dump_tensor(tensor)
2836
3270
  #endif
2837
3271
 
2838
- //------------------------------------------------------------------------------
2839
- // Profiling utility
2840
- //------------------------------------------------------------------------------
2841
- #ifdef GGML_OPENCL_PROFILING
2842
- static void populateProfilingInfo(
2843
- ProfilingInfo& info, cl_event evt, cl_kernel kernel,
2844
- size_t global_size[3], size_t local_size[3],
2845
- const ggml_tensor * tensor) {
2846
- info.op_name = tensor->name;
2847
- info.kernel = kernel;
2848
- info.evt = evt;
2849
-
2850
- info.local_size[0] = local_size[0];
2851
- info.local_size[1] = local_size[1];
2852
- info.local_size[2] = local_size[2];
2853
- info.global_size[0] = global_size[0];
2854
- info.global_size[1] = global_size[1];
2855
- info.global_size[2] = global_size[2];
2856
- info.output_size[0] = tensor->ne[0];
2857
- info.output_size[1] = tensor->ne[1];
2858
- info.output_size[2] = tensor->ne[2];
2859
- info.output_size[3] = tensor->ne[3];
2860
- }
2861
- #endif
2862
-
2863
3272
  //------------------------------------------------------------------------------
2864
3273
  // Ops
2865
3274
  //------------------------------------------------------------------------------
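From here on, every op replaces its per-call #ifdef GGML_OPENCL_PROFILING block (create an event, enqueue, record a ProfilingInfo) with a single backend_ctx->enqueue_ndrange_kernel(...) call, which presumably folds the event bookkeeping into one helper defined earlier in this file. A sketch of what such a wrapper can look like; ClQueueWrapper and KernelLaunch are illustrative names, not the backend's real types:

#include <CL/cl.h>
#include <vector>

struct KernelLaunch {               // minimal stand-in for ProfilingInfo
    cl_kernel kernel;
    cl_event  evt;
};

struct ClQueueWrapper {
    cl_command_queue          queue;
    bool                      profiling = false;
    std::vector<KernelLaunch> launches;

    cl_int enqueue_ndrange(cl_kernel kernel, cl_uint work_dim,
                           const size_t * global, const size_t * local) {
        if (profiling) {
            cl_event evt;
            cl_int err = clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
                                                global, local, 0, nullptr, &evt);
            if (err == CL_SUCCESS) {
                launches.push_back({kernel, evt}); // inspect with clGetEventProfilingInfo later
            }
            return err;
        }
        return clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
                                      global, local, 0, nullptr, nullptr);
    }
};

Centralizing the enqueue also gives one place to add tracing or extra synchronization without touching every op.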
@@ -2903,7 +3312,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
2903
3312
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
2904
3313
 
2905
3314
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2906
- cl_command_queue queue = backend_ctx->queue;
2907
3315
 
2908
3316
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2909
3317
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -2947,16 +3355,8 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
2947
3355
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
2948
3356
  size_t local_work_size[] = {1, 1, 1};
2949
3357
 
2950
- #ifdef GGML_OPENCL_PROFILING
2951
- cl_event evt;
2952
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2953
-
2954
- g_profiling_info.emplace_back();
2955
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2956
- #else
2957
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2958
- #endif
2959
- }
3358
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3359
+ }
2960
3360
 
2961
3361
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2962
3362
  GGML_ASSERT(src0);
@@ -2997,7 +3397,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
2997
3397
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
2998
3398
 
2999
3399
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3000
- cl_command_queue queue = backend_ctx->queue;
3001
3400
 
3002
3401
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3003
3402
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3072,29 +3471,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3072
3471
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3073
3472
  }
3074
3473
 
3075
- #ifdef GGML_OPENCL_PROFILING
3076
- cl_event evt;
3077
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3078
-
3079
- g_profiling_info.emplace_back();
3080
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3081
- #else
3082
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3083
- #endif
3474
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3084
3475
  } else {
3085
3476
  unsigned int nth = MIN(64, ne0);
3086
3477
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3087
3478
  size_t local_work_size[] = {nth, 1, 1};
3088
3479
 
3089
- #ifdef GGML_OPENCL_PROFILING
3090
- cl_event evt;
3091
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3092
-
3093
- g_profiling_info.emplace_back();
3094
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3095
- #else
3096
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3097
- #endif
3480
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3098
3481
  }
3099
3482
  }
3100
3483
 
@@ -3137,7 +3520,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3137
3520
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3138
3521
 
3139
3522
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3140
- cl_command_queue queue = backend_ctx->queue;
3141
3523
 
3142
3524
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3143
3525
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3212,29 +3594,229 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3212
3594
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3213
3595
  }
3214
3596
 
3215
- #ifdef GGML_OPENCL_PROFILING
3216
- cl_event evt;
3217
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3597
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3598
+ } else {
3599
+ unsigned int nth = MIN(64, ne0);
3600
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3601
+ size_t local_work_size[] = {nth, 1, 1};
3218
3602
 
3219
- g_profiling_info.emplace_back();
3220
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3221
- #else
3222
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3223
- #endif
3603
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3604
+ }
3605
+ }
3606
+
3607
+ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3608
+ GGML_ASSERT(src0);
3609
+ GGML_ASSERT(src0->extra);
3610
+ GGML_ASSERT(src1);
3611
+ GGML_ASSERT(src1->extra);
3612
+ GGML_ASSERT(dst);
3613
+ GGML_ASSERT(dst->extra);
3614
+
3615
+ const int ne00 = src0->ne[0];
3616
+ const int ne01 = src0->ne[1];
3617
+ const int ne02 = src0->ne[2];
3618
+ const int ne03 = src0->ne[3];
3619
+
3620
+ const cl_ulong nb00 = src0->nb[0];
3621
+ const cl_ulong nb01 = src0->nb[1];
3622
+ const cl_ulong nb02 = src0->nb[2];
3623
+ const cl_ulong nb03 = src0->nb[3];
3624
+
3625
+ const int ne10 = src1->ne[0];
3626
+ const int ne11 = src1->ne[1];
3627
+ const int ne12 = src1->ne[2];
3628
+ const int ne13 = src1->ne[3];
3629
+
3630
+ const cl_ulong nb10 = src1->nb[0];
3631
+ const cl_ulong nb11 = src1->nb[1];
3632
+ const cl_ulong nb12 = src1->nb[2];
3633
+ const cl_ulong nb13 = src1->nb[3];
3634
+
3635
+ const int ne0 = dst->ne[0];
3636
+
3637
+ const cl_ulong nb0 = dst->nb[0];
3638
+ const cl_ulong nb1 = dst->nb[1];
3639
+ const cl_ulong nb2 = dst->nb[2];
3640
+ const cl_ulong nb3 = dst->nb[3];
3641
+
3642
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3643
+
3644
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3645
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3646
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3647
+
3648
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3649
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3650
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3651
+
3652
+ bool bcast_row = false;
3653
+ cl_kernel kernel;
3654
+
3655
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
3656
+ GGML_ASSERT(ggml_is_contiguous(src0));
3657
+
3658
+ // src1 is a row
3659
+ GGML_ASSERT(ne11 == 1);
3660
+
3661
+ bcast_row = true;
3662
+ int ne = ne00 / 4;
3663
+ kernel = backend_ctx->kernel_div_row;
3664
+
3665
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3666
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3667
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3668
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3669
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3670
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3671
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
3672
+ } else {
3673
+ kernel = backend_ctx->kernel_div;
3674
+
3675
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3676
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3677
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3678
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3679
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3680
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3681
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
3682
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3683
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3684
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3685
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
3686
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
3687
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
3688
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
3689
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
3690
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
3691
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
3692
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
3693
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
3694
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
3695
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
3696
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
3697
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
3698
+ }
3699
+
3700
+ if (bcast_row) {
3701
+ int n = ggml_nelements(dst)/4;
3702
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3703
+ size_t local_work_size[] = {64, 1, 1};
3704
+
3705
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3224
3706
  } else {
3225
3707
  unsigned int nth = MIN(64, ne0);
3226
3708
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3227
3709
  size_t local_work_size[] = {nth, 1, 1};
3228
3710
 
3229
- #ifdef GGML_OPENCL_PROFILING
3230
- cl_event evt;
3231
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3711
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3712
+ }
3713
+ }
3232
3714
 
3233
- g_profiling_info.emplace_back();
3234
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3235
- #else
3236
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3237
- #endif
3715
+ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3716
+ GGML_ASSERT(src0);
3717
+ GGML_ASSERT(src0->extra);
3718
+ GGML_ASSERT(src1);
3719
+ GGML_ASSERT(src1->extra);
3720
+ GGML_ASSERT(dst);
3721
+ GGML_ASSERT(dst->extra);
3722
+
3723
+ const int ne00 = src0->ne[0];
3724
+ const int ne01 = src0->ne[1];
3725
+ const int ne02 = src0->ne[2];
3726
+ const int ne03 = src0->ne[3];
3727
+
3728
+ const cl_ulong nb00 = src0->nb[0];
3729
+ const cl_ulong nb01 = src0->nb[1];
3730
+ const cl_ulong nb02 = src0->nb[2];
3731
+ const cl_ulong nb03 = src0->nb[3];
3732
+
3733
+ const int ne10 = src1->ne[0];
3734
+ const int ne11 = src1->ne[1];
3735
+ const int ne12 = src1->ne[2];
3736
+ const int ne13 = src1->ne[3];
3737
+
3738
+ const cl_ulong nb10 = src1->nb[0];
3739
+ const cl_ulong nb11 = src1->nb[1];
3740
+ const cl_ulong nb12 = src1->nb[2];
3741
+ const cl_ulong nb13 = src1->nb[3];
3742
+
3743
+ const int ne0 = dst->ne[0];
3744
+
3745
+ const cl_ulong nb0 = dst->nb[0];
3746
+ const cl_ulong nb1 = dst->nb[1];
3747
+ const cl_ulong nb2 = dst->nb[2];
3748
+ const cl_ulong nb3 = dst->nb[3];
3749
+
3750
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3751
+
3752
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3753
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3754
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3755
+
3756
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3757
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3758
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3759
+
3760
+ bool bcast_row = false;
3761
+ cl_kernel kernel;
3762
+
3763
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
3764
+ GGML_ASSERT(ggml_is_contiguous(src0));
3765
+
3766
+ // src1 is a row
3767
+ GGML_ASSERT(ne11 == 1);
3768
+
3769
+ bcast_row = true;
3770
+ int ne = ne00 / 4;
3771
+ kernel = backend_ctx->kernel_sub_row;
3772
+
3773
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3774
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3775
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3776
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3777
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3778
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3779
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
3780
+ } else {
3781
+ kernel = backend_ctx->kernel_sub;
3782
+
3783
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3784
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3785
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3786
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3787
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3788
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3789
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
3790
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3791
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3792
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3793
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
3794
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
3795
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
3796
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
3797
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
3798
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
3799
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
3800
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
3801
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
3802
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
3803
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
3804
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
3805
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
3806
+ }
3807
+
3808
+ if (bcast_row) {
3809
+ int n = ggml_nelements(dst)/4;
3810
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3811
+ size_t local_work_size[] = {64, 1, 1};
3812
+
3813
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3814
+ } else {
3815
+ unsigned int nth = MIN(64, ne0);
3816
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3817
+ size_t local_work_size[] = {nth, 1, 1};
3818
+
3819
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3238
3820
  }
3239
3821
  }
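Both new binary ops (ggml_cl_div, ggml_cl_sub) reuse the dispatch pattern of ggml_cl_add/ggml_cl_mul: when src1 collapses to a single contiguous row and the inner dimensions are multiples of 4, the vectorized kernel_*_row path runs over ne00/4 float4 elements; otherwise the general strided kernel runs with one MIN(64, ne0)-wide work-group per row. A host-side sketch of just the dispatch predicate (the real code additionally asserts that src0 is contiguous on the fast path):

#include <cstdint>

// Decide between the float4 row-broadcast kernel and the general strided kernel.
// nelements_src1 is the total element count of src1; ne00/ne10 are the innermost dims.
static bool use_row_broadcast(int64_t nelements_src1, int64_t ne00, int64_t ne10,
                              bool src1_contiguous) {
    return nelements_src1 == ne10     // src1 collapses to a single row
        && src1_contiguous
        && ne00 % 4 == 0              // both rows split evenly into float4
        && ne10 % 4 == 0;
}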
3240
3822
 
@@ -3247,7 +3829,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3247
3829
  UNUSED(src1);
3248
3830
 
3249
3831
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3250
- cl_command_queue queue = backend_ctx->queue;
3251
3832
 
3252
3833
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3253
3834
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3274,15 +3855,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3274
3855
  size_t global_work_size[] = {(size_t)n, 1, 1};
3275
3856
  size_t local_work_size[] = {64, 1, 1};
3276
3857
 
3277
- #ifdef GGML_OPENCL_PROFILING
3278
- cl_event evt;
3279
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3280
-
3281
- g_profiling_info.emplace_back();
3282
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3283
- #else
3284
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3285
- #endif
3858
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3286
3859
  }
3287
3860
 
3288
3861
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3294,7 +3867,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3294
3867
  UNUSED(src1);
3295
3868
 
3296
3869
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3297
- cl_command_queue queue = backend_ctx->queue;
3298
3870
 
3299
3871
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3300
3872
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3321,15 +3893,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3321
3893
  size_t global_work_size[] = {(size_t)n, 1, 1};
3322
3894
  size_t local_work_size[] = {64, 1, 1};
3323
3895
 
3324
- #ifdef GGML_OPENCL_PROFILING
3325
- cl_event evt;
3326
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3327
-
3328
- g_profiling_info.emplace_back();
3329
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3330
- #else
3331
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3332
- #endif
3896
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3333
3897
  }
3334
3898
 
3335
3899
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3341,7 +3905,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3341
3905
  UNUSED(src1);
3342
3906
 
3343
3907
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3344
- cl_command_queue queue = backend_ctx->queue;
3345
3908
 
3346
3909
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3347
3910
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3373,15 +3936,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3373
3936
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3374
3937
  }
3375
3938
 
3376
- #ifdef GGML_OPENCL_PROFILING
3377
- cl_event evt;
3378
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3379
-
3380
- g_profiling_info.emplace_back();
3381
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3382
- #else
3383
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3384
- #endif
3939
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3385
3940
  }
3386
3941
 
3387
3942
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3393,7 +3948,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3393
3948
  UNUSED(src1);
3394
3949
 
3395
3950
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3396
- cl_command_queue queue = backend_ctx->queue;
3397
3951
 
3398
3952
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3399
3953
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3418,15 +3972,50 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3418
3972
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3419
3973
  }
3420
3974
 
3421
- #ifdef GGML_OPENCL_PROFILING
3422
- cl_event evt;
3423
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3975
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3976
+ }
3424
3977
 
3425
- g_profiling_info.emplace_back();
3426
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3427
- #else
3428
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3429
- #endif
3978
+ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3979
+ GGML_ASSERT(src0);
3980
+ GGML_ASSERT(src0->extra);
3981
+ GGML_ASSERT(dst);
3982
+ GGML_ASSERT(dst->extra);
3983
+
3984
+ UNUSED(src1);
3985
+
3986
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3987
+
3988
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3989
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3990
+
3991
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3992
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3993
+
3994
+ cl_kernel kernel;
3995
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3996
+ kernel = backend_ctx->kernel_sigmoid_f32;
3997
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3998
+ kernel = backend_ctx->kernel_sigmoid_f16;
3999
+ } else {
4000
+ GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
4001
+ }
4002
+
4003
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4004
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
4005
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4006
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4007
+
4008
+ const int64_t n = ggml_nelements(dst);
4009
+
4010
+ size_t global_work_size[] = {(size_t)n, 1, 1};
4011
+ size_t local_work_size[] = {64, 1, 1};
4012
+
4013
+ size_t * local_work_size_ptr = local_work_size;
4014
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
4015
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4016
+ }
4017
+
4018
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3430
4019
  }
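ggml_cl_sigmoid, like most element-wise launches in this file, prefers a 64-wide work-group and falls back to a NULL local size (letting the driver choose) when the global size is not a multiple of it and the device lacks non-uniform work-group support. That recurring decision, as a small sketch:

#include <cstddef>

// Returns the local size to pass to clEnqueueNDRangeKernel: either the preferred
// size, or nullptr so the driver picks one when uniform work-groups are required
// and the global size does not divide evenly.
static const size_t * pick_local_size(const size_t preferred[3], size_t global0,
                                      bool non_uniform_workgroups) {
    if (global0 % preferred[0] != 0 && !non_uniform_workgroups) {
        return nullptr;
    }
    return preferred;
}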
3431
4020
 
3432
4021
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3438,7 +4027,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
3438
4027
  UNUSED(src1);
3439
4028
 
3440
4029
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3441
- cl_command_queue queue = backend_ctx->queue;
3442
4030
 
3443
4031
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3444
4032
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3470,15 +4058,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
3470
4058
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3471
4059
  }
3472
4060
 
3473
- #ifdef GGML_OPENCL_PROFILING
3474
- cl_event evt;
3475
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3476
-
3477
- g_profiling_info.emplace_back();
3478
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3479
- #else
3480
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3481
- #endif
4061
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3482
4062
  }
3483
4063
 
3484
4064
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3490,7 +4070,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
3490
4070
  UNUSED(src1);
3491
4071
 
3492
4072
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3493
- cl_command_queue queue = backend_ctx->queue;
3494
4073
 
3495
4074
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3496
4075
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3531,15 +4110,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
3531
4110
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
3532
4111
  size_t local_work_size[] = {(size_t)nth, 1, 1};
3533
4112
 
3534
- #ifdef GGML_OPENCL_PROFILING
3535
- cl_event evt;
3536
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3537
-
3538
- g_profiling_info.emplace_back();
3539
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3540
- #else
3541
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3542
- #endif
4113
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3543
4114
  }
3544
4115
 
3545
4116
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3551,7 +4122,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
3551
4122
  UNUSED(src1);
3552
4123
 
3553
4124
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3554
- cl_command_queue queue = backend_ctx->queue;
3555
4125
 
3556
4126
  //ggml_backend_opencl_device_context * dev_ctx =
3557
4127
  // (ggml_backend_opencl_device_context *)backend->device->context;
@@ -3615,15 +4185,537 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
3615
4185
  // This is local memory - the size depends on subgroup size.
3616
4186
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
3617
4187
 
3618
- #ifdef GGML_OPENCL_PROFILING
3619
- cl_event evt;
3620
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4188
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4189
+ }
3621
4190
 
3622
- g_profiling_info.emplace_back();
3623
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3624
- #else
3625
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3626
- #endif
4191
+ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4192
+ GGML_ASSERT(src0);
4193
+ GGML_ASSERT(src0->extra);
4194
+ GGML_ASSERT(dst);
4195
+ GGML_ASSERT(dst->extra);
4196
+
4197
+ UNUSED(src1);
4198
+
4199
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4200
+
4201
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4202
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4203
+
4204
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4205
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4206
+
4207
+ int32_t n_groups = ((const int32_t *) dst->op_params)[0];
4208
+ int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
4209
+ float eps = ((const float *) dst->op_params)[1];
4210
+
4211
+ const int ne00 = src0->ne[0];
4212
+ const int ne01 = src0->ne[1];
4213
+ const int ne02 = src0->ne[2];
4214
+ const int ne = ne00*ne01*ne02;
4215
+
4216
+ cl_kernel kernel = backend_ctx->kernel_group_norm;
4217
+
4218
+ size_t sgs = 64;
4219
+ if (backend_ctx->gpu_family == ADRENO) {
4220
+ sgs = 64;
4221
+ } else if (backend_ctx->gpu_family == INTEL) {
4222
+ sgs = 32;
4223
+ } else {
4224
+ GGML_ASSERT(false && "Unsupported GPU");
4225
+ }
4226
+
4227
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4228
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
4229
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4230
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4231
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
4232
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
4233
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
4234
+
4235
+ size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4236
+ size_t local_work_size[] = {(size_t)sgs, 1, 1};
4237
+
4238
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4239
+ }
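GROUP_NORM reads n_groups and eps from dst->op_params and derives group_size = ne0 * ne1 * ceil(ne2 / n_groups); one work-group of sgs work-items (64 on Adreno, 32 on Intel here) is launched per group. The group-size arithmetic in isolation:

#include <cstdint>

// Elements per normalization group: whole rows/planes, with the channel
// dimension split into n_groups (rounded up).
static int64_t group_norm_group_size(int64_t ne0, int64_t ne1, int64_t ne2, int32_t n_groups) {
    const int64_t channels_per_group = (ne2 + n_groups - 1) / n_groups; // ceil division
    return ne0 * ne1 * channels_per_group;
}

For example, ne = (8, 8, 32) with n_groups = 4 gives ceil(32/4) = 8 channels per group, i.e. 512 elements per group.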
4240
+
4241
+ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4242
+ GGML_ASSERT(src0);
4243
+ GGML_ASSERT(src0->extra);
4244
+ GGML_ASSERT(dst);
4245
+ GGML_ASSERT(dst->extra);
4246
+
4247
+ UNUSED(src1);
4248
+
4249
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4250
+
4251
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4252
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4253
+
4254
+ cl_ulong offset0_abs = extra0->offset + src0->view_offs;
4255
+ cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
4256
+
4257
+ cl_kernel kernel;
4258
+ if (dst->type == GGML_TYPE_F32) {
4259
+ kernel = backend_ctx->kernel_tanh_f32_nd;
4260
+ } else if (dst->type == GGML_TYPE_F16) {
4261
+ kernel = backend_ctx->kernel_tanh_f16_nd;
4262
+ } else {
4263
+ GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
4264
+ }
4265
+ GGML_ASSERT(kernel != nullptr);
4266
+
4267
+ const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
4268
+ const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
4269
+
4270
+ const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
4271
+ const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
4272
+
4273
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4274
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
4275
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4276
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
4277
+
4278
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
4279
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
4280
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
4281
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
4282
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
4283
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
4284
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
4285
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
4286
+
4287
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
4288
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
4289
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
4290
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
4291
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
4292
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
4293
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
4294
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
4295
+
4296
+ size_t global_work_size[3];
4297
+ if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
4298
+ return;
4299
+ }
4300
+ global_work_size[0] = (size_t)ne10;
4301
+ global_work_size[1] = (size_t)ne11;
4302
+ global_work_size[2] = (size_t)ne12;
4303
+
4304
+ size_t lws0 = 16, lws1 = 4, lws2 = 1;
4305
+ if (ne10 < 16) lws0 = ne10;
4306
+ if (ne11 < 4) lws1 = ne11;
4307
+ if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
4308
+
4309
+ while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
4310
+ while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
4311
+ while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
4312
+
4313
+
4314
+ size_t local_work_size[] = {lws0, lws1, lws2};
4315
+
4316
+ size_t* local_work_size_ptr = local_work_size;
4317
+ if (!backend_ctx->non_uniform_workgroups) {
4318
+ if (global_work_size[0] % local_work_size[0] != 0 ||
4319
+ global_work_size[1] % local_work_size[1] != 0 ||
4320
+ global_work_size[2] % local_work_size[2] != 0) {
4321
+ local_work_size_ptr = NULL;
4322
+ }
4323
+ }
4324
+ if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4325
+
4326
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4327
+ }
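The tanh path launches one work-item per destination element over (ne10, ne11, ne12), starting from a (16, 4, 1) local size, clamping each dimension to the problem size and then halving dimensions until the product fits a 256-work-item budget. That heuristic, condensed into one helper:

#include <algorithm>
#include <cstddef>

// Shrink a starting local size so that (a) no dimension exceeds the global size
// and (b) the total number of work-items stays within `budget`.
static void fit_local_size(size_t lws[3], const size_t gws[3], size_t budget) {
    for (int i = 0; i < 3; ++i) {
        lws[i] = std::max<size_t>(1, std::min(lws[i], gws[i]));
    }
    for (int i = 0; i < 3 && lws[0] * lws[1] * lws[2] > budget; ++i) {
        while (lws[i] > 1 && lws[0] * lws[1] * lws[2] > budget) {
            lws[i] /= 2;
        }
    }
}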
4328
+
4329
+ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
4330
+ GGML_ASSERT(src0);
4331
+ GGML_ASSERT(src0->extra);
4332
+ GGML_ASSERT(dst);
4333
+ GGML_ASSERT(dst->extra);
4334
+ GGML_ASSERT(dst->type == src0->type);
4335
+
4336
+ UNUSED(src1_shape_def);
4337
+
4338
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4339
+
4340
+ if (backend_ctx->kernel_repeat == nullptr) {
4341
+ GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
4342
+ return;
4343
+ }
4344
+
4345
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4346
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4347
+
4348
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4349
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4350
+
4351
+ const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
4352
+ const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
4353
+
4354
+ const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
4355
+ const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
4356
+
4357
+ cl_kernel kernel = backend_ctx->kernel_repeat;
4358
+
4359
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4360
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
4361
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
4362
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4363
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
4364
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
4365
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
4366
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
4367
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
4368
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
4369
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
4370
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
4371
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
4372
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
4373
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
4374
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
4375
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
4376
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
4377
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
4378
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
4379
+
4380
+ size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
4381
+ size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
4382
+ size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
4383
+
4384
+ size_t global_work_size[] = { gws0, gws1, gws2 };
4385
+
4386
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4387
+ }
4388
+
4389
+ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4390
+ GGML_ASSERT(src0);
4391
+ GGML_ASSERT(src0->extra);
4392
+ GGML_ASSERT(dst);
4393
+ GGML_ASSERT(dst->extra);
4394
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4395
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4396
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4397
+
4398
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4399
+
4400
+ if (backend_ctx->kernel_pad == nullptr) {
4401
+ GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
4402
+ return;
4403
+ }
4404
+
4405
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4406
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4407
+
4408
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4409
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4410
+
4411
+ const int s_ne0 = src0->ne[0];
4412
+ const int s_ne1 = src0->ne[1];
4413
+ const int s_ne2 = src0->ne[2];
4414
+
4415
+ const int d_ne0 = dst->ne[0];
4416
+ const int d_ne1 = dst->ne[1];
4417
+ const int d_ne2 = dst->ne[2];
4418
+
4419
+ cl_kernel kernel = backend_ctx->kernel_pad;
4420
+
4421
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4422
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4423
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4424
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4425
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
4426
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
4427
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
4428
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
4429
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
4430
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
4431
+
4432
+ size_t lws0 = 64;
4433
+ size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
4434
+
4435
+ size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
4436
+ size_t local_work_size[] = { lws0, 1, 1 };
4437
+
4438
+ size_t * local_work_size_ptr = local_work_size;
4439
+ if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
4440
+ local_work_size_ptr = nullptr;
4441
+ }
4442
+
4443
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4444
+ }
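The pad launch rounds the innermost global size up to the next multiple of the 64-wide work-group, so out-of-range work-items must exit early inside the kernel. The rounding itself:

#include <cstddef>

// Round `n` up to the next multiple of `wg` (work-group size).
static size_t round_up(size_t n, size_t wg) {
    return ((n + wg - 1) / wg) * wg;
}
// e.g. round_up(70, 64) == 128, so work-items 70..127 must exit early in the kernel.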
4445
+
4446
+ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4447
+ GGML_ASSERT(src0);
4448
+ GGML_ASSERT(src0->extra);
4449
+ GGML_ASSERT(dst);
4450
+ GGML_ASSERT(dst->extra);
4451
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4452
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4453
+
4454
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4455
+
4456
+ const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4457
+ cl_kernel kernel = nullptr;
4458
+
4459
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4460
+ kernel = backend_ctx->kernel_upscale;
4461
+ if (kernel == nullptr) {
4462
+ GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
4463
+ return;
4464
+ }
4465
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4466
+ kernel = backend_ctx->kernel_upscale_bilinear;
4467
+ if (kernel == nullptr) {
4468
+ GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
4469
+ return;
4470
+ }
4471
+ } else {
4472
+ GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
4473
+ return;
4474
+ }
4475
+
4476
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4477
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4478
+
4479
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4480
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4481
+
4482
+ const cl_ulong nb00 = src0->nb[0];
4483
+ const cl_ulong nb01 = src0->nb[1];
4484
+ const cl_ulong nb02 = src0->nb[2];
4485
+ const cl_ulong nb03 = src0->nb[3];
4486
+
4487
+ const int ne00_src = src0->ne[0];
4488
+ const int ne01_src = src0->ne[1];
4489
+
4490
+ const int ne10_dst = dst->ne[0];
4491
+ const int ne11_dst = dst->ne[1];
4492
+ const int ne12_dst = dst->ne[2];
4493
+ const int ne13_dst = dst->ne[3];
4494
+
4495
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
4496
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
4497
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
4498
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
4499
+
4500
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4501
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4502
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4503
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4504
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
4505
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
4506
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
4507
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
4508
+
4509
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4510
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
4511
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
4512
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
4513
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
4514
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
4515
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
4516
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
4517
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
4518
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4519
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
4520
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
4521
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
4522
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
4523
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
4524
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
4525
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
4526
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
4527
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
4528
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
4529
+ }
4530
+
4531
+
4532
+ size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
4533
+ if (dst_total_elements == 0) {
4534
+ return;
4535
+ }
4536
+ size_t global_work_size[] = { dst_total_elements, 1, 1 };
4537
+ size_t local_work_size_pref = 256;
4538
+ size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
4539
+
4540
+ size_t * local_work_size_ptr = local_work_size;
4541
+ if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
4542
+ local_work_size_ptr = nullptr;
4543
+ }
4544
+
4545
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4546
+ }
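Upscale selects the nearest or bilinear kernel from op_params and passes per-axis scale factors sf = dst_ne / src_ne; each destination work-item maps back into the source using those factors. A sketch of the factor computation and the nearest-neighbour index mapping it implies (the mapping mirrors what such a kernel typically does, not the .cl source itself):

#include <cmath>
#include <cstdint>

struct scale_factors { float sf0, sf1, sf2, sf3; };

static scale_factors upscale_factors(const int64_t src_ne[4], const int64_t dst_ne[4]) {
    return {
        (float) dst_ne[0] / src_ne[0],
        (float) dst_ne[1] / src_ne[1],
        (float) dst_ne[2] / src_ne[2],
        (float) dst_ne[3] / src_ne[3],
    };
}

// Nearest-neighbour: a destination index divided by the scale factor lands on the source index.
static int64_t nearest_src_index(int64_t dst_idx, float sf) {
    return (int64_t) std::floor(dst_idx / sf);
}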
4547
+
4548
+ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4549
+ GGML_ASSERT(src0);
4550
+ GGML_ASSERT(src0->extra);
4551
+ GGML_ASSERT(src1);
4552
+ GGML_ASSERT(src1->extra);
4553
+ GGML_ASSERT(dst);
4554
+ GGML_ASSERT(dst->extra);
4555
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4556
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4557
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4558
+
4559
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4560
+ cl_command_queue queue = backend_ctx->queue;
4561
+
4562
+ if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
4563
+ GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
4564
+ return;
4565
+ }
4566
+
4567
+ ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
4568
+ ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
4569
+ ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
4570
+
4571
+ cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
4572
+ cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
4573
+ cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
4574
+
4575
+ const int32_t dim = ((const int32_t *) dst->op_params)[0];
4576
+ GGML_ASSERT(dim >= 0 && dim <= 3);
4577
+
4578
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
4579
+ if (dim == 3) {
4580
+
4581
+ size_t nbytes_src0 = ggml_nbytes(src0);
4582
+ size_t nbytes_src1 = ggml_nbytes(src1);
4583
+
4584
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
4585
+ off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
4586
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
4587
+ off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
4588
+ } else {
4589
+
4590
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
4591
+ size_t global_work_size[3];
4592
+
4593
+ for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
4594
+ cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
4595
+ cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
4596
+ cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
4597
+
4598
+ int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
4599
+ int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
4600
+ int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
4601
+
4602
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4603
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
4604
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4605
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
4606
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4607
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
4608
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
4609
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
4610
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
4611
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
4612
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
4613
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
4614
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
4615
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
4616
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
4617
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
4618
+
4619
+ global_work_size[0] = d_ne0;
4620
+ global_work_size[1] = d_ne1;
4621
+ global_work_size[2] = d_ne2;
4622
+
4623
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4624
+ }
4625
+ }
4626
+ } else {
4627
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
4628
+
4629
+ long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
4630
+ cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
4631
+
4632
+ cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
4633
+
4634
+ long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
4635
+ cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
4636
+
4636
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4639
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4640
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4641
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
4642
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4643
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
4644
+
4645
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
4646
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
4647
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
4648
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
4649
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
4650
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
4651
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
4652
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
4653
+
4654
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
4655
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
4656
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
4657
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
4658
+
4659
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
4660
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
4661
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
4662
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
4663
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
4664
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
4665
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
4666
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
4667
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
4668
+
4669
+ size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
4670
+ d_ne2 > 0 ? (size_t)d_ne2 : 1,
4671
+ d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4672
+
4673
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
4674
+ }
4675
+ }
4676
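For reference, the contiguous dim == 3 branch above needs no kernel at all: with contiguous inputs and output, concatenating along the outermost dimension just places src1's bytes directly after src0's, so two clEnqueueCopyBuffer calls suffice. Every other case boils down to a per-element selection rule; a plain C++ sketch of that rule follows (illustrative shapes and helper names only, not part of the patch):

    #include <array>
    #include <cstdio>

    // Sketch of the element-selection rule the concat kernels implement:
    // an output element at (i0,i1,i2,i3) is read from src0 while its index along
    // the concatenated dimension is still inside src0, otherwise from src1 with
    // that index shifted back by src0's extent along that dimension.
    static float concat_read(const float * src0, const std::array<long,4> & ne0,
                             const float * src1, const std::array<long,4> & ne1,
                             std::array<long,4> idx, int dim) {
        auto flat = [](const std::array<long,4> & ne, const std::array<long,4> & i) {
            return ((i[3]*ne[2] + i[2])*ne[1] + i[1])*ne[0] + i[0]; // row-major flatten
        };
        if (idx[dim] < ne0[dim]) {
            return src0[flat(ne0, idx)];
        }
        idx[dim] -= ne0[dim];
        return src1[flat(ne1, idx)];
    }

    int main() {
        // concat two 2x2 matrices along dim 0 -> a 4x2 result
        float a[4] = {1, 2, 3, 4};
        float b[4] = {5, 6, 7, 8};
        std::array<long,4> nea = {2, 2, 1, 1}, neb = {2, 2, 1, 1};
        for (long i1 = 0; i1 < 2; ++i1) {
            for (long i0 = 0; i0 < 4; ++i0) {
                std::printf("%g ", concat_read(a, nea, b, neb, {i0, i1, 0, 0}, 0));
            }
            std::printf("\n");
        }
    }

The non-contiguous kernel applies the same selection but walks the tensors with the byte strides (nb*, d_nb*) passed as arguments above.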
+
4677
+ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4678
+ GGML_ASSERT(src0);
4679
+ GGML_ASSERT(src0->extra);
4680
+ GGML_ASSERT(dst);
4681
+ GGML_ASSERT(dst->extra);
4682
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4683
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4684
+
4685
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4686
+
4687
+ if (backend_ctx->kernel_timestep_embedding == nullptr) {
4688
+ GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
4689
+ return;
4690
+ }
4691
+
4692
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4693
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4694
+
4695
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4696
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4697
+
4698
+ const int logical_dim = dst->op_params[0];
4699
+ const int max_period = dst->op_params[1];
4700
+ const int dst_nb1_bytes = dst->nb[1];
4701
+
4702
+ cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
4703
+
4704
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4705
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4706
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4707
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4708
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
4709
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
4710
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
4711
+
4712
+ size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
4713
+
4714
+ size_t gws1 = (size_t)src0->ne[0];
4715
+
4716
+ size_t global_work_size[] = {gws0, gws1, 1};
4717
+
4718
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
3627
4719
  }
3628
4720
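For context, gws0 = (logical_dim + 1)/2 + 1 gives one work-item per cos/sin pair plus one slack slot for odd dims, and gws1 covers one timestep per element of src0. A CPU sketch of the standard sinusoidal timestep embedding this computes, assuming the usual cos-then-sin split (the authoritative layout is in the kernel source, which is not part of this hunk):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // CPU sketch of a sinusoidal timestep embedding (first half cosines, second
    // half sines). Illustrative only -- the exact layout produced by
    // kernel_timestep_embedding lives in the .cl source.
    static std::vector<float> timestep_embedding(float timestep, int dim, int max_period) {
        const int half = dim / 2;
        std::vector<float> out(dim, 0.0f);
        for (int j = 0; j < half; ++j) {
            const float freq = std::exp(-std::log((float) max_period) * j / half);
            const float arg  = timestep * freq;
            out[j]        = std::cos(arg);
            out[j + half] = std::sin(arg);
        }
        return out; // odd dims leave the last slot untouched
    }

    int main() {
        const auto e = timestep_embedding(25.0f, 8, 10000);
        for (float v : e) std::printf("%.4f ", v);
        std::printf("\n");
    }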
 
3629
4721
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3638,7 +4730,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
3638
4730
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
3639
4731
 
3640
4732
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3641
- cl_command_queue queue = backend_ctx->queue;
3642
4733
 
3643
4734
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3644
4735
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3843,15 +4934,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
3843
4934
  static_cast<size_t>(padded_height_B)
3844
4935
  };
3845
4936
 
3846
- #ifdef GGML_OPENCL_PROFILING
3847
- cl_event evt;
3848
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
3849
-
3850
- g_profiling_info.emplace_back();
3851
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
3852
- #else
3853
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
3854
- #endif
4937
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
3855
4938
  } else {
3856
4939
  // no need to transpose B in other cases
3857
4940
  // create an image for B from sub_buffer
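Most hunks below follow one pattern: the per-call-site #ifdef GGML_OPENCL_PROFILING block is removed and replaced by a single backend_ctx->enqueue_ndrange_kernel(...) call, which also makes the local cl_command_queue queue = backend_ctx->queue; copies unnecessary. Reconstructed from the deleted lines, the helper presumably looks roughly like the sketch below; its actual definition is outside this excerpt, so the signature and internals are assumptions:

    // Sketch only: consolidates the profiling/non-profiling enqueue paths that the
    // removed #ifdef blocks used to repeat at every call site.
    void ggml_backend_opencl_context::enqueue_ndrange_kernel(
            cl_kernel kernel, cl_uint work_dim,
            const size_t * global_work_size, const size_t * local_work_size,
            const ggml_tensor * tensor) {
    #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                        global_work_size, local_work_size, 0, NULL, &evt));
        g_profiling_info.emplace_back();
        populateProfilingInfo(g_profiling_info.back(), evt, kernel,
                              global_work_size, local_work_size, tensor);
    #else
        GGML_UNUSED(tensor); // tensor is only needed for profiling metadata
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                        global_work_size, local_work_size, 0, NULL, NULL));
    #endif
    }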
@@ -3973,16 +5056,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
3973
5056
 
3974
5057
  // enqueue kernel with profiling
3975
5058
  // <--------------------------------------------> //
3976
- #ifdef GGML_OPENCL_PROFILING
3977
- cl_event evt;
3978
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3979
-
3980
- g_profiling_info.emplace_back();
3981
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3982
- // enqueue kernel without profiling
3983
- #else
3984
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3985
- #endif
5059
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3986
5060
  // <--------------------------------------------> //
3987
5061
 
3988
5062
  // deallocate sub buffers and images
@@ -4062,15 +5136,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4062
5136
  global_work_size[2] = (size_t)ne12*ne13;
4063
5137
  }
4064
5138
 
4065
- #ifdef GGML_OPENCL_PROFILING
4066
- cl_event evt;
4067
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4068
-
4069
- g_profiling_info.emplace_back();
4070
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4071
- #else
4072
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4073
- #endif
5139
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4074
5140
  return;
4075
5141
  }
4076
5142
  #else // GGML_OPENCL_SOA_Q
@@ -4300,15 +5366,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4300
5366
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
4301
5367
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4302
5368
 
4303
- #ifdef GGML_OPENCL_PROFILING
4304
- cl_event evt;
4305
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4306
-
4307
- g_profiling_info.emplace_back();
4308
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4309
- #else
4310
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4311
- #endif
5369
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4312
5370
  } else if (src0t == GGML_TYPE_Q4_K) {
4313
5371
  GGML_ASSERT(false && "not implemented");
4314
5372
  } else if (src0t == GGML_TYPE_Q3_K) {
@@ -4317,33 +5375,138 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4317
5375
  GGML_ASSERT(false && "not implemented");
4318
5376
  } else if (src0t == GGML_TYPE_Q6_K) {
4319
5377
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
4320
- size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4321
-
4322
- #ifdef GGML_OPENCL_PROFILING
4323
- cl_event evt;
4324
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5378
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4325
5379
 
4326
- g_profiling_info.emplace_back();
4327
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4328
- #else
4329
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4330
- #endif
5380
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4331
5381
  } else {
4332
5382
  int64_t ny = (ne11 + nrows - 1)/nrows;
4333
5383
 
4334
5384
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
4335
5385
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4336
5386
 
4337
- #ifdef GGML_OPENCL_PROFILING
4338
- cl_event evt;
4339
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5387
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5388
+ }
5389
+ }
4340
5390
 
4341
- g_profiling_info.emplace_back();
4342
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4343
- #else
4344
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5391
+ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5392
+ GGML_ASSERT(src0);
5393
+ GGML_ASSERT(src0->extra);
5394
+ GGML_ASSERT(src1);
5395
+ GGML_ASSERT(src1->extra);
5396
+ GGML_ASSERT(dst);
5397
+ GGML_ASSERT(dst->extra);
5398
+
5399
+ const ggml_tensor * src2 = dst->src[2];
5400
+ GGML_ASSERT(src2);
5401
+ GGML_ASSERT(src2->extra);
5402
+
5403
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5404
+
5405
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5406
+ ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
5407
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5408
+
5409
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
5410
+ cl_ulong offset2 = extra2->offset + src2->view_offs;
5411
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
5412
+
5413
+ #ifdef GGML_OPENCL_SOA_Q
5414
+ ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
4345
5415
  #endif
5416
+
5417
+ const int ne00 = src0->ne[0];
5418
+ const int ne01 = src0->ne[1];
5419
+ const int ne02 = src0->ne[2];
5420
+ const int ne03 = src0->ne[3];
5421
+
5422
+ const cl_ulong nb00 = src0->nb[0];
5423
+ const cl_ulong nb02 = src0->nb[2];
5424
+
5425
+ const int ne10 = src1->ne[0];
5426
+ const int ne11 = src1->ne[1];
5427
+ const int ne12 = src1->ne[2];
5428
+ const int ne13 = src1->ne[3];
5429
+
5430
+ const cl_ulong nb11 = src1->nb[1];
5431
+ const cl_ulong nb12 = src1->nb[2];
5432
+
5433
+ const int ne20 = src2->ne[0];
5434
+ const int ne21 = src2->ne[1];
5435
+
5436
+ const cl_ulong nb21 = src2->nb[1];
5437
+
5438
+ const int ne0 = dst->ne[0];
5439
+ const int ne1 = dst->ne[1];
5440
+
5441
+ const int r2 = ne12/ne02;
5442
+ const int r3 = ne13/ne03;
5443
+ const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
5444
+
5445
+ GGML_ASSERT(ne00 == ne10);
5446
+
5447
+ int sgs = 32; // subgroup size
5448
+ int nsg = 1; // number of subgroups
5449
+ int nrows = 1; // number of rows in src1
5450
+ int ndst = 4; // number of values produced by each subgroup
5451
+
5452
+ cl_kernel kernel;
5453
+
5454
+ // subgroup mat vec
5455
+ switch (src0->type) {
5456
+ case GGML_TYPE_Q4_0: {
5457
+ kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
5458
+
5459
+ if (backend_ctx->gpu_family == INTEL) {
5460
+ sgs = 16;
5461
+ nsg = 1;
5462
+ ndst = 8;
5463
+ } else if (backend_ctx->gpu_family == ADRENO) {
5464
+ sgs = 64;
5465
+ nsg = 1;
5466
+ ndst = 8;
5467
+ } else {
5468
+ GGML_ASSERT(false && "TODO: Unknown GPU");
5469
+ }
5470
+
5471
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
5472
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
5473
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5474
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5475
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
5476
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
5477
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
5478
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
5479
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
5480
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
5481
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
5482
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
5483
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5484
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
5485
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
5486
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
5487
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
5488
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
5489
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
5490
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
5491
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
5492
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
5493
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
5494
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
5495
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
5496
+
5497
+ break;
5498
+ }
5499
+ default:
5500
+ GGML_ASSERT(false && "not implemented");
4346
5501
  }
5502
+
5503
+ int _ne1 = 1;
5504
+ int ne123 = dst_rows;
5505
+
5506
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5507
+ size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5508
+
5509
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4347
5510
  }
4348
5511
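To make the launch geometry above concrete: gws0 rounds the src0 rows up to ndst outputs per subgroup, gws1 covers the single src1 column per expert, and gws2 enumerates dst_rows = ne20*ne21 expert/row pairs. A standalone check of that arithmetic using the Adreno parameters from this function (sgs = 64, nsg = 1, ndst = 8); the ne01/ne20/ne21 values are made up for illustration:

    #include <cstdio>

    // Reproduces the global/local work-size arithmetic used for
    // kernel_mul_mv_id_q4_0_f32_8x_flat, to make the rounding explicit.
    int main() {
        const int ne01 = 100;   // rows of src0 (example value)
        const int ne20 = 4;     // experts used per token (example value)
        const int ne21 = 7;     // number of token rows (example value)
        const int sgs = 64, nsg = 1, nrows = 1, ndst = 8;

        const int _ne1     = 1;
        const int dst_rows = ne20*ne21;

        const size_t gws0 = (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs; // ceil(ne01/(ndst*nsg)) * sgs
        const size_t gws1 = (size_t)(_ne1 + nrows - 1)/nrows*nsg;
        const size_t gws2 = (size_t)dst_rows;

        std::printf("global = {%zu, %zu, %zu}, local = {%d, %d, 1}\n",
                    gws0, gws1, gws2, sgs, nsg);
        // With ne01 = 100 and ndst = 8: ceil(100/8) = 13 subgroups of 64 -> gws0 = 832.
    }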
 
4349
5512
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4356,7 +5519,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
4356
5519
  GGML_ASSERT(ggml_is_contiguous(src0));
4357
5520
 
4358
5521
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4359
- cl_command_queue queue = backend_ctx->queue;
4360
5522
 
4361
5523
  float scale;
4362
5524
  memcpy(&scale, dst->op_params, sizeof(scale));
@@ -4385,15 +5547,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
4385
5547
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4386
5548
  }
4387
5549
 
4388
- #ifdef GGML_OPENCL_PROFILING
4389
- cl_event evt;
4390
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4391
-
4392
- g_profiling_info.emplace_back();
4393
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4394
- #else
4395
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4396
- #endif
5550
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4397
5551
  }
4398
5552
 
4399
5553
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4430,7 +5584,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
4430
5584
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4431
5585
 
4432
5586
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4433
- cl_command_queue queue = backend_ctx->queue;
4434
5587
 
4435
5588
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4436
5589
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4495,15 +5648,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
4495
5648
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4496
5649
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4497
5650
 
4498
- #ifdef GGML_OPENCL_PROFILING
4499
- cl_event evt;
4500
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4501
-
4502
- g_profiling_info.emplace_back();
4503
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
4504
- #else
4505
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4506
- #endif
5651
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
4507
5652
  }
4508
5653
 
4509
5654
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4526,7 +5671,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
4526
5671
  const int ne02 = src0 ? src0->ne[2] : 0;
4527
5672
 
4528
5673
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4529
- cl_command_queue queue = backend_ctx->queue;
4530
5674
 
4531
5675
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4532
5676
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4550,15 +5694,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
4550
5694
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
4551
5695
  size_t local_work_size[] = {64, 1, 1};
4552
5696
 
4553
- #ifdef GGML_OPENCL_PROFILING
4554
- cl_event evt;
4555
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4556
-
4557
- g_profiling_info.emplace_back();
4558
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4559
- #else
4560
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4561
- #endif
5697
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4562
5698
  } else {
4563
5699
  kernel = backend_ctx->kernel_diag_mask_inf;
4564
5700
 
@@ -4578,15 +5714,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
4578
5714
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4579
5715
  }
4580
5716
 
4581
- #ifdef GGML_OPENCL_PROFILING
4582
- cl_event evt;
4583
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4584
-
4585
- g_profiling_info.emplace_back();
4586
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4587
- #else
4588
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4589
- #endif
5717
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4590
5718
  }
4591
5719
  }
4592
5720
 
@@ -4606,7 +5734,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
4606
5734
  }
4607
5735
 
4608
5736
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4609
- cl_command_queue queue = backend_ctx->queue;
4610
5737
 
4611
5738
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4612
5739
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4686,15 +5813,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
4686
5813
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4687
5814
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4688
5815
 
4689
- #ifdef GGML_OPENCL_PROFILING
4690
- cl_event evt;
4691
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4692
-
4693
- g_profiling_info.emplace_back();
4694
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4695
- #else
4696
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4697
- #endif
5816
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4698
5817
  }
4699
5818
 
4700
5819
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4706,7 +5825,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
4706
5825
  GGML_ASSERT(dst->extra);
4707
5826
 
4708
5827
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4709
- cl_command_queue queue = backend_ctx->queue;
4710
5828
 
4711
5829
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4712
5830
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4872,15 +5990,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
4872
5990
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4873
5991
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4874
5992
 
4875
- #ifdef GGML_OPENCL_PROFILING
4876
- cl_event evt;
4877
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4878
-
4879
- g_profiling_info.emplace_back();
4880
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4881
- #else
4882
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4883
- #endif
5993
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4884
5994
  }
4885
5995
 
4886
5996
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4895,7 +6005,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
4895
6005
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
4896
6006
 
4897
6007
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4898
- cl_command_queue queue = backend_ctx->queue;
4899
6008
 
4900
6009
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4901
6010
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4964,15 +6073,192 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
4964
6073
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
4965
6074
  size_t local_work_size[] = {256, 1, 1};
4966
6075
 
4967
- #ifdef GGML_OPENCL_PROFILING
4968
- cl_event evt;
4969
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6076
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6077
+ }
4970
6078
 
4971
- g_profiling_info.emplace_back();
4972
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4973
- #else
4974
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4975
- #endif
6079
+ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6080
+ GGML_ASSERT(src0);
6081
+ GGML_ASSERT(src0->extra);
6082
+ GGML_ASSERT(dst);
6083
+ GGML_ASSERT(dst->extra);
6084
+ GGML_UNUSED(src1);
6085
+
6086
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6087
+ GGML_ASSERT( dst->type == GGML_TYPE_I32);
6088
+ GGML_ASSERT(ggml_is_contiguous(src0));
6089
+
6090
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6091
+
6092
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6093
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6094
+
6095
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
6096
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
6097
+
6098
+ const int ne00 = src0->ne[0];
6099
+ const int nrows = ggml_nrows(src0);
6100
+
6101
+ int ne00_padded = 1;
6102
+ while (ne00_padded < ne00) {
6103
+ ne00_padded *= 2;
6104
+ }
6105
+
6106
+ int order = (enum ggml_sort_order) dst->op_params[0];
6107
+
6108
+ cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
6109
+
6110
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6111
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6112
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6113
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6114
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
6115
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
6116
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
6117
+ CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
6118
+
6119
+ size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
6120
+ size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
6121
+
6122
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6123
+ }
6124
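The argsort launch uses one work-group per row with a local size of ne00_padded (the row length rounded up to a power of two), and argument 7 reserves ne00_padded ints of local memory, the usual setup for an in-workgroup bitonic sort; slots past ne00 are presumably handled as sentinels inside the kernel. The rounding, for reference:

    #include <cstdio>

    // Round a row length up to the next power of two, as done before launching
    // the argsort kernel; one work-item per padded slot, one work-group per row.
    static int next_pow2(int n) {
        int p = 1;
        while (p < n) {
            p *= 2;
        }
        return p;
    }

    int main() {
        const int tests[] = {1, 5, 64, 100, 1000};
        for (int ne00 : tests) {
            std::printf("ne00 = %4d -> padded = %4d\n", ne00, next_pow2(ne00));
        }
    }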
+
6125
+ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6126
+ GGML_ASSERT(src0);
6127
+ GGML_ASSERT(src0->extra);
6128
+ GGML_ASSERT(dst);
6129
+ GGML_ASSERT(dst->extra);
6130
+ GGML_UNUSED(src1);
6131
+
6132
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
6133
+ GGML_ASSERT(ggml_is_contiguous(src0));
6134
+
6135
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6136
+
6137
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6138
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6139
+
6140
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
6141
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
6142
+
6143
+ const int ne00 = src0->ne[0];
6144
+ const int ne01 = src0->ne[1];
6145
+ const int ne02 = src0->ne[2];
6146
+ const int ne03 = src0->ne[3];
6147
+
6148
+ const cl_ulong nb01 = src0->nb[1];
6149
+ const cl_ulong nb02 = src0->nb[2];
6150
+ const cl_ulong nb03 = src0->nb[3];
6151
+
6152
+ const cl_ulong nb1 = dst->nb[1];
6153
+ const cl_ulong nb2 = dst->nb[2];
6154
+ const cl_ulong nb3 = dst->nb[3];
6155
+
6156
+ cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
6157
+
6158
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6159
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6160
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6161
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6162
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
6163
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
6164
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
6165
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
6166
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
6167
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
6168
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
6169
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
6170
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
6171
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
6172
+
6173
+ size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6174
+ size_t local_work_size[] = {(size_t)64, 1, 1};
6175
+
6176
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6177
+ }
6178
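All strides here are passed in bytes (nb01..nb03 for src0, nb1..nb3 for dst), so the kernel never has to assume a packed layout beyond the contiguous innermost dimension. A CPU reference of the same row reduction using byte strides (illustrative, not the kernel code):

    #include <cstdint>
    #include <cstdio>

    // Sum over the innermost dimension with byte strides, mirroring the argument
    // layout handed to kernel_sum_rows_f32 (ne00..ne03, nb01..nb03, nb1..nb3).
    static void sum_rows_f32(const float * src, float * dst,
                             int ne00, int ne01, int ne02, int ne03,
                             size_t nb01, size_t nb02, size_t nb03,   // src strides, bytes
                             size_t nb1,  size_t nb2,  size_t nb3) {  // dst strides, bytes
        for (int i3 = 0; i3 < ne03; ++i3) {
            for (int i2 = 0; i2 < ne02; ++i2) {
                for (int i1 = 0; i1 < ne01; ++i1) {
                    const float * row = (const float *)((const uint8_t *) src
                                        + i1*nb01 + i2*nb02 + i3*nb03);
                    float sum = 0.0f;
                    for (int i0 = 0; i0 < ne00; ++i0) {
                        sum += row[i0];
                    }
                    *(float *)((uint8_t *) dst + i1*nb1 + i2*nb2 + i3*nb3) = sum;
                }
            }
        }
    }

    int main() {
        float src[2*3] = {1, 2, 3, 4, 5, 6};   // 2 rows of 3
        float dst[2]   = {0, 0};
        sum_rows_f32(src, dst, 3, 2, 1, 1,
                     3*sizeof(float), 6*sizeof(float), 6*sizeof(float),
                     sizeof(float), 2*sizeof(float), 2*sizeof(float));
        std::printf("%g %g\n", dst[0], dst[1]);   // prints: 6 15
    }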
+
6179
+ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6180
+ GGML_ASSERT(src0);
6181
+ GGML_ASSERT(src0->extra);
6182
+ GGML_ASSERT(dst);
6183
+ GGML_ASSERT(dst->extra);
6184
+
6185
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
6186
+
6187
+ if (src1) {
6188
+ GGML_ASSERT(src1);
6189
+ GGML_ASSERT(src1->extra);
6190
+ GGML_ASSERT(ggml_are_same_shape(src0, src1));
6191
+ }
6192
+
6193
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6194
+
6195
+ cl_kernel kernel;
6196
+ switch (ggml_get_glu_op(dst)) {
6197
+ case GGML_GLU_OP_GEGLU:
6198
+ if (dst->type == GGML_TYPE_F32) {
6199
+ kernel = backend_ctx->kernel_geglu;
6200
+ } else {
6201
+ kernel = backend_ctx->kernel_geglu_f16;
6202
+ }
6203
+ break;
6204
+ case GGML_GLU_OP_REGLU:
6205
+ if (dst->type == GGML_TYPE_F32) {
6206
+ kernel = backend_ctx->kernel_reglu;
6207
+ } else {
6208
+ kernel = backend_ctx->kernel_reglu_f16;
6209
+ }
6210
+ break;
6211
+ case GGML_GLU_OP_SWIGLU:
6212
+ if (dst->type == GGML_TYPE_F32) {
6213
+ kernel = backend_ctx->kernel_swiglu;
6214
+ } else {
6215
+ kernel = backend_ctx->kernel_swiglu_f16;
6216
+ }
6217
+ break;
6218
+ default:
6219
+ GGML_ABORT("Unsupported glu op");
6220
+ }
6221
+
6222
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6223
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6224
+
6225
+ ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
6226
+
6227
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
6228
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
6229
+
6230
+ cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
6231
+
6232
+ const int ne0 = dst->ne[0];
6233
+
6234
+ const cl_ulong nb01 = src0->nb[1];
6235
+ const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
6236
+
6237
+ const cl_ulong nb1 = dst->nb[1];
6238
+
6239
+ const int swp = ((const int32_t *) dst->op_params)[1];
6240
+ const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
6241
+ const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
6242
+
6243
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6244
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6245
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
6246
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6247
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6248
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6249
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
6250
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
6251
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
6252
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
6253
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
6254
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
6255
+
6256
+ const size_t nrows = ggml_nrows(src0);
6257
+ size_t nth = 512;
6258
+ size_t global_work_size[] = {nrows*nth, 1, 1};
6259
+ size_t local_work_size[] = {nth, 1, 1};
6260
+
6261
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4976
6262
  }
4977
6263
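When src1 is null the gate lives in the second half of the same src0 row: ne00_off and ne10_off select the two halves, and op_params[1] (swp) swaps which half is the activation and which is the gate. A CPU sketch of that selection for the SWIGLU case, assuming the usual silu(a) * b definition (the f16 variants and the kernel-side math live in the .cl sources, not shown here):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Sketch of the host-side half-selection logic for GLU when src1 == nullptr:
    // one row holds [a | b] (or [b | a] when swapped), each half of length ne0.
    static std::vector<float> swiglu_row(const std::vector<float> & row, int ne0, bool swapped) {
        const int a_off = swapped ? ne0 : 0;   // plays the role of ne00_off
        const int b_off = swapped ? 0 : ne0;   // plays the role of ne10_off
        std::vector<float> out(ne0);
        for (int i = 0; i < ne0; ++i) {
            const float a = row[a_off + i];
            const float b = row[b_off + i];
            const float silu = a / (1.0f + std::exp(-a));
            out[i] = silu * b;                 // swiglu: silu(a) * b
        }
        return out;
    }

    int main() {
        std::vector<float> row = {1.0f, 2.0f, 0.5f, 0.25f};  // [a0 a1 | b0 b1], ne0 = 2
        for (float v : swiglu_row(row, 2, /*swapped=*/false)) std::printf("%.4f ", v);
        std::printf("\n");
    }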
 
4978
6264
  //------------------------------------------------------------------------------
@@ -5023,6 +6309,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5023
6309
  }
5024
6310
  func = ggml_cl_mul;
5025
6311
  break;
6312
+ case GGML_OP_DIV:
6313
+ if (!any_on_device) {
6314
+ return false;
6315
+ }
6316
+ func = ggml_cl_div;
6317
+ break;
6318
+ case GGML_OP_SUB:
6319
+ if (!any_on_device) {
6320
+ return false;
6321
+ }
6322
+ func = ggml_cl_sub;
6323
+ break;
5026
6324
  case GGML_OP_UNARY:
5027
6325
  switch (ggml_get_unary_op(tensor)) {
5028
6326
  case GGML_UNARY_OP_GELU:
@@ -5049,9 +6347,27 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5049
6347
  }
5050
6348
  func = ggml_cl_relu;
5051
6349
  break;
6350
+ case GGML_UNARY_OP_SIGMOID:
6351
+ if (!any_on_device) {
6352
+ return false;
6353
+ }
6354
+ func = ggml_cl_sigmoid;
6355
+ break;
6356
+ case GGML_UNARY_OP_TANH:
6357
+ if (!any_on_device) {
6358
+ return false;
6359
+ }
6360
+ func = ggml_cl_tanh;
6361
+ break;
5052
6362
  default:
5053
6363
  return false;
5054
6364
  } break;
6365
+ case GGML_OP_GLU:
6366
+ if (!any_on_device) {
6367
+ return false;
6368
+ }
6369
+ func = ggml_cl_glu;
6370
+ break;
5055
6371
  case GGML_OP_CLAMP:
5056
6372
  if (!any_on_device) {
5057
6373
  return false;
@@ -5070,12 +6386,54 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5070
6386
  }
5071
6387
  func = ggml_cl_rms_norm;
5072
6388
  break;
6389
+ case GGML_OP_GROUP_NORM:
6390
+ if (!any_on_device) {
6391
+ return false;
6392
+ }
6393
+ func = ggml_cl_group_norm;
6394
+ break;
6395
+ case GGML_OP_REPEAT:
6396
+ if (!any_on_device) {
6397
+ return false;
6398
+ }
6399
+ func = ggml_cl_repeat;
6400
+ break;
6401
+ case GGML_OP_PAD:
6402
+ if (!any_on_device) {
6403
+ return false;
6404
+ }
6405
+ ggml_cl_pad(backend, tensor->src[0], tensor);
6406
+ return true;
6407
+ case GGML_OP_UPSCALE:
6408
+ if (!any_on_device) {
6409
+ return false;
6410
+ }
6411
+ ggml_cl_upscale(backend, tensor->src[0], tensor);
6412
+ return true;
6413
+ case GGML_OP_CONCAT:
6414
+ if (!any_on_device) {
6415
+ return false;
6416
+ }
6417
+ func = ggml_cl_concat;
6418
+ break;
6419
+ case GGML_OP_TIMESTEP_EMBEDDING:
6420
+ if (!any_on_device) {
6421
+ return false;
6422
+ }
6423
+ ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
6424
+ return true;
5073
6425
  case GGML_OP_MUL_MAT:
5074
6426
  if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
5075
6427
  return false;
5076
6428
  }
5077
6429
  func = ggml_cl_mul_mat;
5078
6430
  break;
6431
+ case GGML_OP_MUL_MAT_ID:
6432
+ if (!any_on_device) {
6433
+ return false;
6434
+ }
6435
+ func = ggml_cl_mul_mat_id;
6436
+ break;
5079
6437
  case GGML_OP_SCALE:
5080
6438
  if (!any_on_device) {
5081
6439
  return false;
@@ -5115,6 +6473,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5115
6473
  }
5116
6474
  func = ggml_cl_im2col;
5117
6475
  break;
6476
+ case GGML_OP_ARGSORT:
6477
+ if (!any_on_device) {
6478
+ return false;
6479
+ }
6480
+ func = ggml_cl_argsort;
6481
+ break;
6482
+ case GGML_OP_SUM_ROWS:
6483
+ if (!any_on_device) {
6484
+ return false;
6485
+ }
6486
+ func = ggml_cl_sum_rows;
6487
+ break;
5118
6488
  default:
5119
6489
  return false;
5120
6490
  }