@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
231
231
  return { type, major, minor, patch };
232
232
  }
233
233
 
234
+ // Profiling
235
+ struct ProfilingInfo {
236
+ std::string op_name;
237
+ std::string kernel_name;
238
+
239
+ cl_kernel kernel;
240
+ cl_event evt;
241
+
242
+ cl_ulong cmd_queued;
243
+ cl_ulong cmd_submit;
244
+ cl_ulong cmd_start;
245
+ cl_ulong cmd_end;
246
+ cl_ulong overhead_start;
247
+ cl_ulong overhead_end;
248
+ // For the times below, see spec for clGetEventProfilingInfo
249
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
250
+ cl_ulong cmd_queued_duration_ns;
251
+ // The time kernel spent for submission - START - SUBMIT
252
+ cl_ulong cmd_submit_duration_ns;
253
+ // Kernel execution time in nanoseconds - END - START
254
+ cl_ulong cmd_duration_ns;
255
+ // The time for the kernel to complete - COMPLETE - END
256
+ cl_ulong cmd_complete_duration_ns;
257
+ // Total time to finish the kernel - COMPLETE - QUEUED
258
+ cl_ulong cmd_total_duration_ns;
259
+ // Global and local work sizes.
260
+ size_t global_size[3];
261
+ size_t local_size[3];
262
+ // Op output size.
263
+ size_t output_size[4];
264
+ };
265
+
266
+ static void populateProfilingInfo(
267
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
268
+ size_t global_size[3], size_t local_size[3],
269
+ const ggml_tensor * tensor) {
270
+ info.op_name = tensor->name;
271
+ info.kernel = kernel;
272
+ info.evt = evt;
273
+
274
+ // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
275
+ info.local_size[0] = 0;
276
+ info.local_size[1] = 0;
277
+ info.local_size[2] = 0;
278
+
279
+ info.global_size[0] = 0;
280
+ info.global_size[1] = 0;
281
+ info.global_size[2] = 0;
282
+
283
+ if (local_size) {
284
+ for (cl_uint i = 0; i < work_dim; ++i) {
285
+ info.local_size[i] = local_size[i];
286
+ }
287
+ }
288
+
289
+ for (cl_uint i = 0; i < work_dim; ++i) {
290
+ info.global_size[i] = global_size[i];
291
+ }
292
+
293
+ info.output_size[0] = tensor->ne[0];
294
+ info.output_size[1] = tensor->ne[1];
295
+ info.output_size[2] = tensor->ne[2];
296
+ info.output_size[3] = tensor->ne[3];
297
+ }
298
+
234
299
  struct ggml_backend_opencl_context;
235
300
 
236
301
  // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
254
319
 
255
320
  // backend context
256
321
  struct ggml_backend_opencl_context {
322
+ int ref_count;
323
+
257
324
  cl_device_id device;
258
325
  std::string device_name;
259
326
 
@@ -369,6 +436,108 @@ struct ggml_backend_opencl_context {
369
436
  cl_kernel kernel_timestep_embedding;
370
437
  cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
371
438
 
439
+ std::vector<ProfilingInfo> profiling_info;
440
+
441
+ void write_profiling_info() {
442
+ FILE * fperf = fopen("cl_profiling.csv", "w");
443
+ if (!fperf) {
444
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
445
+ return;
446
+ }
447
+
448
+ // Populate profiling info
449
+ for (ProfilingInfo & info : profiling_info) {
450
+ cl_ulong cmd_queued;
451
+ cl_ulong cmd_submit;
452
+ cl_ulong cmd_start;
453
+ cl_ulong cmd_end;
454
+ cl_ulong cmd_complete;
455
+
456
+ CL_CHECK(clWaitForEvents(1, &info.evt));
457
+ CL_CHECK(clGetEventProfilingInfo(
458
+ info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
459
+ CL_CHECK(clGetEventProfilingInfo(
460
+ info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
461
+ CL_CHECK(clGetEventProfilingInfo(
462
+ info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
463
+ CL_CHECK(clGetEventProfilingInfo(
464
+ info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
465
+ CL_CHECK(clGetEventProfilingInfo(
466
+ info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
467
+ CL_CHECK(clReleaseEvent(info.evt));
468
+
469
+ char kernel_name[512];
470
+ CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
471
+ sizeof(kernel_name), kernel_name, NULL));
472
+ info.kernel_name = kernel_name;
473
+
474
+ info.cmd_queued = cmd_queued;
475
+ info.cmd_submit = cmd_submit;
476
+ info.cmd_start = cmd_start;
477
+ info.cmd_end = cmd_end;
478
+
479
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
480
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
481
+ info.cmd_duration_ns = cmd_end - cmd_start;
482
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
483
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
484
+ }
485
+
486
+ // Dump a csv
487
+ float total_kernel_time = 0;
488
+ fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
489
+ for (const ProfilingInfo & info : profiling_info) {
490
+ total_kernel_time += info.cmd_duration_ns/1.e6f;
491
+ fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
492
+ info.op_name.c_str(), info.kernel_name.c_str(),
493
+ info.cmd_queued_duration_ns/1.e6f,
494
+ info.cmd_submit_duration_ns/1.e6f,
495
+ info.cmd_duration_ns/1.e6f,
496
+ info.cmd_complete_duration_ns/1.e6f,
497
+ info.cmd_total_duration_ns/1.e6f,
498
+ info.global_size[0], info.global_size[1], info.global_size[2],
499
+ info.local_size[0], info.local_size[1], info.local_size[2],
500
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
501
+ }
502
+ fclose(fperf);
503
+
504
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
505
+
506
+ // Dump a simple chrome trace
507
+ FILE* ftrace = fopen("cl_trace.json", "w");
508
+ if (!ftrace) {
509
+ GGML_LOG_ERROR("Failed to open cl_trace.json\n");
510
+ return;
511
+ }
512
+
513
+ fprintf(ftrace, "[\n");
514
+ for (const ProfilingInfo & info : profiling_info) {
515
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
516
+ info.kernel_name.c_str(), info.cmd_queued/1000);
517
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
518
+ info.kernel_name.c_str(), info.cmd_submit/1000);
519
+
520
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
521
+ info.kernel_name.c_str(), info.cmd_start/1000);
522
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
523
+ info.kernel_name.c_str(), info.cmd_end/1000);
524
+ }
525
+ fclose(ftrace);
526
+ }
527
+
528
+ void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
529
+ #ifdef GGML_OPENCL_PROFILING
530
+ cl_event evt;
531
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
532
+
533
+ profiling_info.emplace_back();
534
+ populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
535
+ #else
536
+ GGML_UNUSED(tensor);
537
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
538
+ #endif
539
+ }
540
+
372
541
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
373
542
  // Transpose kernels
374
543
  cl_program program_transpose;
@@ -395,46 +564,19 @@ struct ggml_backend_opencl_context {
395
564
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
396
565
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
397
566
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
398
- };
399
567
 
400
- // All registered devices with a default device in the front.
401
- static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
402
-
403
- // Profiling
568
+ void free() {
569
+ ref_count--;
570
+ if (ref_count == 0) {
404
571
  #ifdef GGML_OPENCL_PROFILING
405
- struct ProfilingInfo {
406
- std::string op_name;
407
- std::string kernel_name;
408
-
409
- cl_kernel kernel;
410
- cl_event evt;
411
-
412
- cl_ulong cmd_queued;
413
- cl_ulong cmd_submit;
414
- cl_ulong cmd_start;
415
- cl_ulong cmd_end;
416
- cl_ulong overhead_start;
417
- cl_ulong overhead_end;
418
- // For the times below, see spec for clGetEventProfilingInfo
419
- // The time kernel spent in cmd queue - SUBMIT - QUEUED
420
- cl_ulong cmd_queued_duration_ns;
421
- // The time kernel spent for submission - START - SUBMIT
422
- cl_ulong cmd_submit_duration_ns;
423
- // Kernel execution time in nanoseconds - END - START
424
- cl_ulong cmd_duration_ns;
425
- // The time for the kernel to complete - COMPLETE - END
426
- cl_ulong cmd_complete_duration_ns;
427
- // Total time to finish the kernel - COMPELTE - QUEUED
428
- cl_ulong cmd_total_duration_ns;
429
- // Global and local work sizes.
430
- size_t global_size[3];
431
- size_t local_size[3];
432
- // Op output size.
433
- size_t output_size[4];
572
+ write_profiling_info();
573
+ #endif
574
+ }
575
+ }
434
576
  };
435
577
 
436
- std::vector<ProfilingInfo> g_profiling_info;
437
- #endif
578
+ // All registered devices with a default device in the front.
579
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
438
580
 
439
581
  inline std::string read_file(const std::string &path) {
440
582
  std::ifstream ifs(path);
@@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1669
1811
  backend_ctx->device = dev_ctx->device;
1670
1812
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
1671
1813
 
1814
+ // ref_count get increased in ggml_backend_opencl_device_init
1815
+ // This function is also used to retrieve backend context, so we don't want
1816
+ // to increase ref_count for each call. We only want to increase ref_count
1817
+ // when the associated device is initialized
1818
+ backend_ctx->ref_count = 0;
1819
+
1672
1820
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
1673
1821
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
1674
1822
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1841
1989
  return dev_ctx->backend_ctx;
1842
1990
  }
1843
1991
 
1844
- static void ggml_cl2_free(void) {
1845
- #ifdef GGML_OPENCL_PROFILING
1846
- FILE * fperf = fopen("cl_profiling.csv", "w");
1847
- if (!fperf) {
1848
- GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
1849
- return;
1850
- }
1992
+ static void ggml_cl2_free(ggml_backend_t backend) {
1993
+ ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
1994
+ ctx->free();
1851
1995
 
1852
- // Populate profiling info
1853
- for (ProfilingInfo & info : g_profiling_info) {
1854
- cl_ulong cmd_queued;
1855
- cl_ulong cmd_submit;
1856
- cl_ulong cmd_start;
1857
- cl_ulong cmd_end;
1858
- cl_ulong cmd_complete;
1859
-
1860
- CL_CHECK(clWaitForEvents(1, &info.evt));
1861
- CL_CHECK(clGetEventProfilingInfo(
1862
- info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
1863
- CL_CHECK(clGetEventProfilingInfo(
1864
- info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
1865
- CL_CHECK(clGetEventProfilingInfo(
1866
- info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
1867
- CL_CHECK(clGetEventProfilingInfo(
1868
- info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
1869
- CL_CHECK(clGetEventProfilingInfo(
1870
- info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
1871
- CL_CHECK(clReleaseEvent(info.evt));
1872
-
1873
- char kernel_name[512];
1874
- CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
1875
- sizeof(kernel_name), kernel_name, NULL));
1876
- info.kernel_name = kernel_name;
1877
-
1878
- info.cmd_queued = cmd_queued;
1879
- info.cmd_submit = cmd_submit;
1880
- info.cmd_start = cmd_start;
1881
- info.cmd_end = cmd_end;
1882
-
1883
- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
1884
- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
1885
- info.cmd_duration_ns = cmd_end - cmd_start;
1886
- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
1887
- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
1888
- }
1889
-
1890
- // Dump a csv
1891
- float total_kernel_time = 0;
1892
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
1893
- for (const ProfilingInfo & info : g_profiling_info) {
1894
- total_kernel_time += info.cmd_duration_ns/1.e6f;
1895
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
1896
- info.op_name.c_str(), info.kernel_name.c_str(),
1897
- info.cmd_queued_duration_ns/1.e6f,
1898
- info.cmd_submit_duration_ns/1.e6f,
1899
- info.cmd_duration_ns/1.e6f,
1900
- info.cmd_complete_duration_ns/1.e6f,
1901
- info.cmd_total_duration_ns/1.e6f,
1902
- info.global_size[0], info.global_size[1], info.global_size[2],
1903
- info.local_size[0], info.local_size[1], info.local_size[2],
1904
- info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
1905
- }
1906
- fclose(fperf);
1907
-
1908
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
1909
-
1910
- // Dump a simple chrome trace
1911
- FILE* ftrace = fopen("cl_trace.json", "w");
1912
- if (!ftrace) {
1913
- GGML_LOG_ERROR("Failed to open cl_trace.json\n");
1914
- return;
1996
+ // The CL context is shared by all backends, release it if all backends have been released
1997
+ bool should_release_opencl = true;
1998
+ for (auto device : g_ggml_backend_opencl_devices) {
1999
+ ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2000
+ if (ctx_dev->backend_ctx->ref_count > 0) {
2001
+ should_release_opencl = false;
2002
+ }
1915
2003
  }
1916
2004
 
1917
- fprintf(ftrace, "[\n");
1918
- for (const ProfilingInfo & info : g_profiling_info) {
1919
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1920
- info.kernel_name.c_str(), info.cmd_queued/1000);
1921
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1922
- info.kernel_name.c_str(), info.cmd_submit/1000);
1923
-
1924
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1925
- info.kernel_name.c_str(), info.cmd_start/1000);
1926
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1927
- info.kernel_name.c_str(), info.cmd_end/1000);
2005
+ if (should_release_opencl) {
2006
+ CL_CHECK(clReleaseContext(ctx->context));
1928
2007
  }
1929
- fclose(ftrace);
1930
- #endif
1931
2008
  }
1932
2009
 
1933
2010
  //------------------------------------------------------------------------------
@@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
2011
2088
  }
2012
2089
 
2013
2090
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
2014
- ggml_cl2_free();
2015
-
2016
- GGML_UNUSED(backend);
2091
+ ggml_cl2_free(backend);
2017
2092
  }
2018
2093
 
2019
2094
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
2899
2974
 
2900
2975
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2901
2976
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
2977
+ // Getting a new reference to the backend, increase ref_count
2978
+ backend_ctx->ref_count++;
2902
2979
 
2903
2980
  ggml_backend_t backend = new ggml_backend {
2904
2981
  /* .guid = */ ggml_backend_opencl_guid(),
@@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3159
3236
  #define dump_tensor(tensor)
3160
3237
  #endif
3161
3238
 
3162
- //------------------------------------------------------------------------------
3163
- // Profiling utility
3164
- //------------------------------------------------------------------------------
3165
- #ifdef GGML_OPENCL_PROFILING
3166
- static void populateProfilingInfo(
3167
- ProfilingInfo& info, cl_event evt, cl_kernel kernel,
3168
- size_t global_size[3], size_t local_size[3],
3169
- const ggml_tensor * tensor) {
3170
- info.op_name = tensor->name;
3171
- info.kernel = kernel;
3172
- info.evt = evt;
3173
-
3174
- info.local_size[0] = local_size[0];
3175
- info.local_size[1] = local_size[1];
3176
- info.local_size[2] = local_size[2];
3177
- info.global_size[0] = global_size[0];
3178
- info.global_size[1] = global_size[1];
3179
- info.global_size[2] = global_size[2];
3180
- info.output_size[0] = tensor->ne[0];
3181
- info.output_size[1] = tensor->ne[1];
3182
- info.output_size[2] = tensor->ne[2];
3183
- info.output_size[3] = tensor->ne[3];
3184
- }
3185
- #endif
3186
-
3187
3239
  //------------------------------------------------------------------------------
3188
3240
  // Ops
3189
3241
  //------------------------------------------------------------------------------
@@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3227
3279
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
3228
3280
 
3229
3281
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3230
- cl_command_queue queue = backend_ctx->queue;
3231
3282
 
3232
3283
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3233
3284
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3271
3322
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3272
3323
  size_t local_work_size[] = {1, 1, 1};
3273
3324
 
3274
- #ifdef GGML_OPENCL_PROFILING
3275
- cl_event evt;
3276
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3277
-
3278
- g_profiling_info.emplace_back();
3279
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3280
- #else
3281
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3282
- #endif
3325
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3283
3326
  }
3284
3327
 
3285
3328
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3321
3364
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3322
3365
 
3323
3366
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3324
- cl_command_queue queue = backend_ctx->queue;
3325
3367
 
3326
3368
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3327
3369
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3396
3438
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3397
3439
  }
3398
3440
 
3399
- #ifdef GGML_OPENCL_PROFILING
3400
- cl_event evt;
3401
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3402
-
3403
- g_profiling_info.emplace_back();
3404
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3405
- #else
3406
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3407
- #endif
3441
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3408
3442
  } else {
3409
3443
  unsigned int nth = MIN(64, ne0);
3410
3444
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3411
3445
  size_t local_work_size[] = {nth, 1, 1};
3412
3446
 
3413
- #ifdef GGML_OPENCL_PROFILING
3414
- cl_event evt;
3415
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3416
-
3417
- g_profiling_info.emplace_back();
3418
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3419
- #else
3420
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3421
- #endif
3447
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3422
3448
  }
3423
3449
  }
3424
3450
 
@@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3461
3487
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3462
3488
 
3463
3489
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3464
- cl_command_queue queue = backend_ctx->queue;
3465
3490
 
3466
3491
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3467
3492
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3536
3561
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3537
3562
  }
3538
3563
 
3539
- #ifdef GGML_OPENCL_PROFILING
3540
- cl_event evt;
3541
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3542
-
3543
- g_profiling_info.emplace_back();
3544
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3545
- #else
3546
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3547
- #endif
3564
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3548
3565
  } else {
3549
3566
  unsigned int nth = MIN(64, ne0);
3550
3567
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3551
3568
  size_t local_work_size[] = {nth, 1, 1};
3552
3569
 
3553
- #ifdef GGML_OPENCL_PROFILING
3554
- cl_event evt;
3555
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3556
-
3557
- g_profiling_info.emplace_back();
3558
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3559
- #else
3560
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3561
- #endif
3570
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3562
3571
  }
3563
3572
  }
3564
3573
 
@@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3598
3607
  const cl_ulong nb3 = dst->nb[3];
3599
3608
 
3600
3609
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3601
- cl_command_queue queue = backend_ctx->queue;
3602
3610
 
3603
3611
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3604
3612
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3661
3669
  size_t global_work_size[] = {(size_t)n, 1, 1};
3662
3670
  size_t local_work_size[] = {64, 1, 1};
3663
3671
 
3664
- #ifdef GGML_OPENCL_PROFILING
3665
- cl_event evt;
3666
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3667
-
3668
- g_profiling_info.emplace_back();
3669
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3670
- #else
3671
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3672
- #endif
3672
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3673
3673
  } else {
3674
3674
  unsigned int nth = MIN(64, ne0);
3675
3675
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3676
3676
  size_t local_work_size[] = {nth, 1, 1};
3677
3677
 
3678
- #ifdef GGML_OPENCL_PROFILING
3679
- cl_event evt;
3680
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3681
-
3682
- g_profiling_info.emplace_back();
3683
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3684
- #else
3685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3686
- #endif
3678
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3687
3679
  }
3688
3680
  }
3689
3681
 
@@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3723
3715
  const cl_ulong nb3 = dst->nb[3];
3724
3716
 
3725
3717
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3726
- cl_command_queue queue = backend_ctx->queue;
3727
3718
 
3728
3719
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3729
3720
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3786
3777
  size_t global_work_size[] = {(size_t)n, 1, 1};
3787
3778
  size_t local_work_size[] = {64, 1, 1};
3788
3779
 
3789
- #ifdef GGML_OPENCL_PROFILING
3790
- cl_event evt;
3791
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3792
-
3793
- g_profiling_info.emplace_back();
3794
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3795
- #else
3796
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3797
- #endif
3780
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3798
3781
  } else {
3799
3782
  unsigned int nth = MIN(64, ne0);
3800
3783
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3801
3784
  size_t local_work_size[] = {nth, 1, 1};
3802
3785
 
3803
- #ifdef GGML_OPENCL_PROFILING
3804
- cl_event evt;
3805
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3806
-
3807
- g_profiling_info.emplace_back();
3808
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3809
- #else
3810
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3811
- #endif
3786
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3812
3787
  }
3813
3788
  }
3814
3789
 
@@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3821
3796
  UNUSED(src1);
3822
3797
 
3823
3798
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3824
- cl_command_queue queue = backend_ctx->queue;
3825
3799
 
3826
3800
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3827
3801
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3848
3822
  size_t global_work_size[] = {(size_t)n, 1, 1};
3849
3823
  size_t local_work_size[] = {64, 1, 1};
3850
3824
 
3851
- #ifdef GGML_OPENCL_PROFILING
3852
- cl_event evt;
3853
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3854
-
3855
- g_profiling_info.emplace_back();
3856
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3857
- #else
3858
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3859
- #endif
3825
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3860
3826
  }
3861
3827
 
3862
3828
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3868
3834
  UNUSED(src1);
3869
3835
 
3870
3836
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3871
- cl_command_queue queue = backend_ctx->queue;
3872
3837
 
3873
3838
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3874
3839
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3895
3860
  size_t global_work_size[] = {(size_t)n, 1, 1};
3896
3861
  size_t local_work_size[] = {64, 1, 1};
3897
3862
 
3898
- #ifdef GGML_OPENCL_PROFILING
3899
- cl_event evt;
3900
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3901
-
3902
- g_profiling_info.emplace_back();
3903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3904
- #else
3905
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3906
- #endif
3863
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3907
3864
  }
3908
3865
 
3909
3866
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3915
3872
  UNUSED(src1);
3916
3873
 
3917
3874
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3918
- cl_command_queue queue = backend_ctx->queue;
3919
3875
 
3920
3876
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3921
3877
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3947
3903
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3948
3904
  }
3949
3905
 
3950
- #ifdef GGML_OPENCL_PROFILING
3951
- cl_event evt;
3952
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3953
-
3954
- g_profiling_info.emplace_back();
3955
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3956
- #else
3957
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3958
- #endif
3906
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3959
3907
  }
3960
3908
 
3961
3909
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3967
3915
  UNUSED(src1);
3968
3916
 
3969
3917
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3970
- cl_command_queue queue = backend_ctx->queue;
3971
3918
 
3972
3919
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3973
3920
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3992
3939
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3993
3940
  }
3994
3941
 
3995
- #ifdef GGML_OPENCL_PROFILING
3996
- cl_event evt;
3997
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3998
-
3999
- g_profiling_info.emplace_back();
4000
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4001
- #else
4002
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4003
- #endif
3942
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4004
3943
  }
4005
3944
 
4006
3945
  static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4012
3951
  UNUSED(src1);
4013
3952
 
4014
3953
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4015
- cl_command_queue queue = backend_ctx->queue;
4016
3954
 
4017
3955
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4018
3956
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4044
3982
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4045
3983
  }
4046
3984
 
4047
- #ifdef GGML_OPENCL_PROFILING
4048
- cl_event evt;
4049
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4050
-
4051
- g_profiling_info.emplace_back();
4052
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4053
- #else
4054
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4055
- #endif
3985
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4056
3986
  }
4057
3987
 
4058
3988
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4064
3994
  UNUSED(src1);
4065
3995
 
4066
3996
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4067
- cl_command_queue queue = backend_ctx->queue;
4068
3997
 
4069
3998
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4070
3999
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4096
4025
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4097
4026
  }
4098
4027
 
4099
- #ifdef GGML_OPENCL_PROFILING
4100
- cl_event evt;
4101
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4102
-
4103
- g_profiling_info.emplace_back();
4104
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4105
- #else
4106
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4107
- #endif
4028
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4108
4029
  }
4109
4030
 
4110
4031
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4116
4037
  UNUSED(src1);
4117
4038
 
4118
4039
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4119
- cl_command_queue queue = backend_ctx->queue;
4120
4040
 
4121
4041
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4122
4042
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4157
4077
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4158
4078
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4159
4079
 
4160
- #ifdef GGML_OPENCL_PROFILING
4161
- cl_event evt;
4162
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4163
-
4164
- g_profiling_info.emplace_back();
4165
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4166
- #else
4167
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4168
- #endif
4080
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4169
4081
  }
4170
4082
 
4171
4083
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4177
4089
  UNUSED(src1);
4178
4090
 
4179
4091
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4180
- cl_command_queue queue = backend_ctx->queue;
4181
4092
 
4182
4093
  //ggml_backend_opencl_device_context * dev_ctx =
4183
4094
  // (ggml_backend_opencl_device_context *)backend->device->context;
@@ -4241,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4241
4152
  // This is local memory - the size depends on subgroup size.
4242
4153
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
4243
4154
 
4244
- #ifdef GGML_OPENCL_PROFILING
4245
- cl_event evt;
4246
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4247
-
4248
- g_profiling_info.emplace_back();
4249
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4250
- #else
4251
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4252
- #endif
4155
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4253
4156
  }
4254
4157
 
4255
4158
  static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4261
4164
  UNUSED(src1);
4262
4165
 
4263
4166
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4264
- cl_command_queue queue = backend_ctx->queue;
4265
4167
 
4266
4168
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4267
4169
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4300
4202
  size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4301
4203
  size_t local_work_size[] = {(size_t)sgs, 1, 1};
4302
4204
 
4303
- #ifdef GGML_OPENCL_PROFILING
4304
- cl_event evt;
4305
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4306
-
4307
- g_profiling_info.emplace_back();
4308
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4309
- #else
4310
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4311
- #endif
4205
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4312
4206
  }
4313
4207
 
4314
4208
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4320
4214
  UNUSED(src1);
4321
4215
 
4322
4216
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4323
- cl_command_queue queue = backend_ctx->queue;
4324
4217
 
4325
4218
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4326
4219
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4397
4290
  }
4398
4291
  if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4399
4292
 
4400
-
4401
- #ifdef GGML_OPENCL_PROFILING
4402
- cl_event evt;
4403
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4404
-
4405
- g_profiling_info.emplace_back();
4406
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4407
- #else
4408
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4409
- #endif
4293
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4410
4294
  }
4411
4295
 
4412
4296
  static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4419
4303
  UNUSED(src1_shape_def);
4420
4304
 
4421
4305
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
- cl_command_queue queue = backend_ctx->queue;
4423
4306
 
4424
4307
  if (backend_ctx->kernel_repeat == nullptr) {
4425
4308
  GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4467
4350
 
4468
4351
  size_t global_work_size[] = { gws0, gws1, gws2 };
4469
4352
 
4470
- #ifdef GGML_OPENCL_PROFILING
4471
- cl_event evt;
4472
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
4473
-
4474
- g_profiling_info.emplace_back();
4475
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
4476
- #else
4477
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4478
- #endif
4353
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4479
4354
  }
4480
4355
 
4481
4356
  static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4488
4363
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4489
4364
 
4490
4365
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4491
- cl_command_queue queue = backend_ctx->queue;
4492
4366
 
4493
4367
  if (backend_ctx->kernel_pad == nullptr) {
4494
4368
  GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4533
4407
  local_work_size_ptr = nullptr;
4534
4408
  }
4535
4409
 
4536
- #ifdef GGML_OPENCL_PROFILING
4537
- cl_event evt;
4538
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4539
-
4540
- g_profiling_info.emplace_back();
4541
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4542
- #else
4543
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4544
- #endif
4410
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4545
4411
  }
4546
4412
 
4547
4413
  static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4553
4419
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4554
4420
 
4555
4421
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4556
- cl_command_queue queue = backend_ctx->queue;
4557
4422
 
4558
4423
  const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4559
4424
  cl_kernel kernel = nullptr;
@@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4644
4509
  local_work_size_ptr = nullptr;
4645
4510
  }
4646
4511
 
4647
- #ifdef GGML_OPENCL_PROFILING
4648
- cl_event evt;
4649
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4650
-
4651
- g_profiling_info.emplace_back();
4652
- size_t profiling_gws[3] = {global_work_size[0], 1, 1};
4653
- size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
4654
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4655
- #else
4656
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4657
- #endif
4512
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4658
4513
  }
4659
4514
 
4660
4515
  static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4732
4587
  global_work_size[1] = d_ne1;
4733
4588
  global_work_size[2] = d_ne2;
4734
4589
 
4735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4590
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4736
4591
  }
4737
4592
  }
4738
4593
  } else {
@@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4782
4637
  d_ne2 > 0 ? (size_t)d_ne2 : 1,
4783
4638
  d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4784
4639
 
4785
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
4640
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
4786
4641
  }
4787
4642
  }
4788
4643
 
@@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4795
4650
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4796
4651
 
4797
4652
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4798
- cl_command_queue queue = backend_ctx->queue;
4799
4653
 
4800
4654
  if (backend_ctx->kernel_timestep_embedding == nullptr) {
4801
4655
  GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4828
4682
 
4829
4683
  size_t global_work_size[] = {gws0, gws1, 1};
4830
4684
 
4831
- #ifdef GGML_OPENCL_PROFILING
4832
- cl_event evt;
4833
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
4834
-
4835
- g_profiling_info.emplace_back();
4836
- size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
4837
- size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
4838
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4839
- #else
4840
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
4841
- #endif
4685
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4842
4686
  }
4843
4687
 
4844
4688
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4853
4697
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4854
4698
 
4855
4699
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4856
- cl_command_queue queue = backend_ctx->queue;
4857
4700
 
4858
4701
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4859
4702
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5058
4901
  static_cast<size_t>(padded_height_B)
5059
4902
  };
5060
4903
 
5061
- #ifdef GGML_OPENCL_PROFILING
5062
- cl_event evt;
5063
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
5064
-
5065
- g_profiling_info.emplace_back();
5066
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
5067
- #else
5068
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
5069
- #endif
4904
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
5070
4905
  } else {
5071
4906
  // no need to transpose B in other cases
5072
4907
  // create an image for B from sub_buffer
@@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5188
5023
 
5189
5024
  // enqueue kernel with profiling
5190
5025
  // <--------------------------------------------> //
5191
- #ifdef GGML_OPENCL_PROFILING
5192
- cl_event evt;
5193
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5194
-
5195
- g_profiling_info.emplace_back();
5196
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5197
- // enqueue kernel without profiling
5198
- #else
5199
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5200
- #endif
5026
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5201
5027
  // <--------------------------------------------> //
5202
5028
 
5203
5029
  // deallocate sub buffers and images
@@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5277
5103
  global_work_size[2] = (size_t)ne12*ne13;
5278
5104
  }
5279
5105
 
5280
- #ifdef GGML_OPENCL_PROFILING
5281
- cl_event evt;
5282
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5283
-
5284
- g_profiling_info.emplace_back();
5285
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5286
- #else
5287
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5288
- #endif
5106
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5289
5107
  return;
5290
5108
  }
5291
5109
  #else // GGML_OPENCL_SOA_Q
@@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5515
5333
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5516
5334
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5517
5335
 
5518
- #ifdef GGML_OPENCL_PROFILING
5519
- cl_event evt;
5520
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5521
-
5522
- g_profiling_info.emplace_back();
5523
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5524
- #else
5525
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5526
- #endif
5336
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5527
5337
  } else if (src0t == GGML_TYPE_Q4_K) {
5528
5338
  GGML_ASSERT(false && "not implemented");
5529
5339
  } else if (src0t == GGML_TYPE_Q3_K) {
@@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5534
5344
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5535
5345
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5536
5346
 
5537
- #ifdef GGML_OPENCL_PROFILING
5538
- cl_event evt;
5539
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5540
-
5541
- g_profiling_info.emplace_back();
5542
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5543
- #else
5544
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5545
- #endif
5347
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5546
5348
  } else {
5547
5349
  int64_t ny = (ne11 + nrows - 1)/nrows;
5548
5350
 
5549
5351
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
5550
5352
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5551
5353
 
5552
- #ifdef GGML_OPENCL_PROFILING
5553
- cl_event evt;
5554
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5555
-
5556
- g_profiling_info.emplace_back();
5557
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5558
- #else
5559
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5560
- #endif
5354
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5561
5355
  }
5562
5356
  }
5563
5357
 
@@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5574
5368
  GGML_ASSERT(src2->extra);
5575
5369
 
5576
5370
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5577
- cl_command_queue queue = backend_ctx->queue;
5578
5371
 
5579
5372
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5580
5373
  ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
@@ -5680,15 +5473,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5680
5473
  size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5681
5474
  size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5682
5475
 
5683
- #ifdef GGML_OPENCL_PROFILING
5684
- cl_event evt;
5685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5686
-
5687
- g_profiling_info.emplace_back();
5688
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5689
- #else
5690
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5691
- #endif
5476
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5692
5477
  }
5693
5478
 
5694
5479
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5701
5486
  GGML_ASSERT(ggml_is_contiguous(src0));
5702
5487
 
5703
5488
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5704
- cl_command_queue queue = backend_ctx->queue;
5705
5489
 
5706
5490
  float scale;
5707
5491
  memcpy(&scale, dst->op_params, sizeof(scale));
@@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5730
5514
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5731
5515
  }
5732
5516
 
5733
- #ifdef GGML_OPENCL_PROFILING
5734
- cl_event evt;
5735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5736
-
5737
- g_profiling_info.emplace_back();
5738
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5739
- #else
5740
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5741
- #endif
5517
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5742
5518
  }
5743
5519
 
5744
5520
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5775
5551
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
5776
5552
 
5777
5553
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5778
- cl_command_queue queue = backend_ctx->queue;
5779
5554
 
5780
5555
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5781
5556
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5840
5615
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5841
5616
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5842
5617
 
5843
- #ifdef GGML_OPENCL_PROFILING
5844
- cl_event evt;
5845
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5846
-
5847
- g_profiling_info.emplace_back();
5848
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
5849
- #else
5850
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5851
- #endif
5618
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
5852
5619
  }
5853
5620
 
5854
5621
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5871
5638
  const int ne02 = src0 ? src0->ne[2] : 0;
5872
5639
 
5873
5640
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5874
- cl_command_queue queue = backend_ctx->queue;
5875
5641
 
5876
5642
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5877
5643
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5895
5661
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
5896
5662
  size_t local_work_size[] = {64, 1, 1};
5897
5663
 
5898
- #ifdef GGML_OPENCL_PROFILING
5899
- cl_event evt;
5900
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5901
-
5902
- g_profiling_info.emplace_back();
5903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5904
- #else
5905
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5906
- #endif
5664
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5907
5665
  } else {
5908
5666
  kernel = backend_ctx->kernel_diag_mask_inf;
5909
5667
 
@@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5923
5681
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5924
5682
  }
5925
5683
 
5926
- #ifdef GGML_OPENCL_PROFILING
5927
- cl_event evt;
5928
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5929
-
5930
- g_profiling_info.emplace_back();
5931
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5932
- #else
5933
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5934
- #endif
5684
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5935
5685
  }
5936
5686
  }
5937
5687
 
@@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5951
5701
  }
5952
5702
 
5953
5703
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5954
- cl_command_queue queue = backend_ctx->queue;
5955
5704
 
5956
5705
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5957
5706
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
6031
5780
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6032
5781
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6033
5782
 
6034
- #ifdef GGML_OPENCL_PROFILING
6035
- cl_event evt;
6036
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6037
-
6038
- g_profiling_info.emplace_back();
6039
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6040
- #else
6041
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6042
- #endif
5783
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6043
5784
  }
6044
5785
 
6045
5786
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6051
5792
  GGML_ASSERT(dst->extra);
6052
5793
 
6053
5794
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6054
- cl_command_queue queue = backend_ctx->queue;
6055
5795
 
6056
5796
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6057
5797
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6217
5957
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6218
5958
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6219
5959
 
6220
- #ifdef GGML_OPENCL_PROFILING
6221
- cl_event evt;
6222
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6223
-
6224
- g_profiling_info.emplace_back();
6225
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6226
- #else
6227
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6228
- #endif
5960
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6229
5961
  }
6230
5962
 
6231
5963
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6240
5972
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
6241
5973
 
6242
5974
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6243
- cl_command_queue queue = backend_ctx->queue;
6244
5975
 
6245
5976
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6246
5977
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6309
6040
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
6310
6041
  size_t local_work_size[] = {256, 1, 1};
6311
6042
 
6312
- #ifdef GGML_OPENCL_PROFILING
6313
- cl_event evt;
6314
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6315
-
6316
- g_profiling_info.emplace_back();
6317
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6318
- #else
6319
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6320
- #endif
6043
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6321
6044
  }
6322
6045
 
6323
6046
  static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6332
6055
  GGML_ASSERT(ggml_is_contiguous(src0));
6333
6056
 
6334
6057
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6335
- cl_command_queue queue = backend_ctx->queue;
6336
6058
 
6337
6059
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6338
6060
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6364
6086
  size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
6365
6087
  size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
6366
6088
 
6367
- #ifdef GGML_OPENCL_PROFILING
6368
- cl_event evt;
6369
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6370
-
6371
- g_profiling_info.emplace_back();
6372
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6373
- #else
6374
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6375
- #endif
6089
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6376
6090
  }
6377
6091
 
6378
6092
  static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6386
6100
  GGML_ASSERT(ggml_is_contiguous(src0));
6387
6101
 
6388
6102
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6389
- cl_command_queue queue = backend_ctx->queue;
6390
6103
 
6391
6104
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6392
6105
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6427
6140
  size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6428
6141
  size_t local_work_size[] = {(size_t)64, 1, 1};
6429
6142
 
6430
- #ifdef GGML_OPENCL_PROFILING
6431
- cl_event evt;
6432
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6433
-
6434
- g_profiling_info.emplace_back();
6435
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6436
- #else
6437
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6438
- #endif
6143
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6439
6144
  }
6440
6145
 
6441
6146
  //------------------------------------------------------------------------------