@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  13. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  18. package/cpp/LlamaCppModel.cpp +56 -22
  19. package/cpp/build-info.cpp +2 -2
  20. package/cpp/llama.cpp/CMakeLists.txt +1 -2
  21. package/cpp/llama.cpp/README.md +4 -5
  22. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  23. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  24. package/cpp/llama.cpp/common/arg.cpp +24 -0
  25. package/cpp/llama.cpp/common/chat.cpp +37 -20
  26. package/cpp/llama.cpp/common/chat.h +2 -0
  27. package/cpp/llama.cpp/common/common.cpp +3 -0
  28. package/cpp/llama.cpp/common/common.h +5 -0
  29. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  30. package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
  31. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  32. package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
  33. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  34. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  35. package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
  36. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  100. package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
  101. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
  103. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
  104. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
  105. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  133. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  135. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  136. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  137. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
  138. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  141. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  142. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  144. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  168. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  169. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  170. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  172. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  173. package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
  177. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
  178. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
  179. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  180. package/cpp/llama.cpp/include/llama.h +8 -43
  181. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  182. package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
  183. package/cpp/llama.cpp/src/llama-arch.h +36 -1
  184. package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
  185. package/cpp/llama.cpp/src/llama-batch.h +105 -70
  186. package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
  187. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  188. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  189. package/cpp/llama.cpp/src/llama-context.h +13 -13
  190. package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
  191. package/cpp/llama.cpp/src/llama-graph.h +78 -79
  192. package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
  193. package/cpp/llama.cpp/src/llama-hparams.h +11 -0
  194. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
  195. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  196. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
  197. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
  198. package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
  199. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
  200. package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
  201. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
  202. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  203. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  204. package/cpp/llama.cpp/src/llama-memory.h +21 -22
  205. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  206. package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
  207. package/cpp/llama.cpp/src/llama-model.h +40 -0
  208. package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
  209. package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
  210. package/cpp/llama.cpp/src/llama-vocab.h +42 -0
  211. package/cpp/rn-utils.h +3 -0
  212. package/ios/include/chat.h +2 -0
  213. package/ios/include/common.h +5 -0
  214. package/ios/include/llama.h +8 -43
  215. package/ios/libs/llama.xcframework/Info.plist +19 -19
  216. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  218. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  219. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  220. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
  221. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  223. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  224. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  225. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  248. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  250. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
  254. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
  255. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  261. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  262. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  263. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  264. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  265. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  267. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  268. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
  269. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
  270. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
  274. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  275. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  276. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  277. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  278. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  279. package/package.json +1 -1
  280. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  315. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  316. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  317. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  318. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  319. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
231
231
  return { type, major, minor, patch };
232
232
  }
233
233
 
234
+ // Profiling
235
+ struct ProfilingInfo {
236
+ std::string op_name;
237
+ std::string kernel_name;
238
+
239
+ cl_kernel kernel;
240
+ cl_event evt;
241
+
242
+ cl_ulong cmd_queued;
243
+ cl_ulong cmd_submit;
244
+ cl_ulong cmd_start;
245
+ cl_ulong cmd_end;
246
+ cl_ulong overhead_start;
247
+ cl_ulong overhead_end;
248
+ // For the times below, see spec for clGetEventProfilingInfo
249
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
250
+ cl_ulong cmd_queued_duration_ns;
251
+ // The time kernel spent for submission - START - SUBMIT
252
+ cl_ulong cmd_submit_duration_ns;
253
+ // Kernel execution time in nanoseconds - END - START
254
+ cl_ulong cmd_duration_ns;
255
+ // The time for the kernel to complete - COMPLETE - END
256
+ cl_ulong cmd_complete_duration_ns;
257
+ // Total time to finish the kernel - COMPLETE - QUEUED
258
+ cl_ulong cmd_total_duration_ns;
259
+ // Global and local work sizes.
260
+ size_t global_size[3];
261
+ size_t local_size[3];
262
+ // Op output size.
263
+ size_t output_size[4];
264
+ };
265
+
266
+ static void populateProfilingInfo(
267
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
268
+ size_t global_size[3], size_t local_size[3],
269
+ const ggml_tensor * tensor) {
270
+ info.op_name = tensor->name;
271
+ info.kernel = kernel;
272
+ info.evt = evt;
273
+
274
+ // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
275
+ info.local_size[0] = 0;
276
+ info.local_size[1] = 0;
277
+ info.local_size[2] = 0;
278
+
279
+ info.global_size[0] = 0;
280
+ info.global_size[1] = 0;
281
+ info.global_size[2] = 0;
282
+
283
+ if (local_size) {
284
+ for (cl_uint i = 0; i < work_dim; ++i) {
285
+ info.local_size[i] = local_size[i];
286
+ }
287
+ }
288
+
289
+ for (cl_uint i = 0; i < work_dim; ++i) {
290
+ info.global_size[i] = global_size[i];
291
+ }
292
+
293
+ info.output_size[0] = tensor->ne[0];
294
+ info.output_size[1] = tensor->ne[1];
295
+ info.output_size[2] = tensor->ne[2];
296
+ info.output_size[3] = tensor->ne[3];
297
+ }
298
+
234
299
  struct ggml_backend_opencl_context;
235
300
 
236
301
  // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
254
319
 
255
320
  // backend context
256
321
  struct ggml_backend_opencl_context {
322
+ int ref_count;
323
+
257
324
  cl_device_id device;
258
325
  std::string device_name;
259
326
 
@@ -284,6 +351,8 @@ struct ggml_backend_opencl_context {
284
351
  cl_program program_gemv_noshuffle_general;
285
352
  cl_program program_gemv_noshuffle;
286
353
  cl_program program_get_rows;
354
+ cl_program program_set_rows;
355
+ cl_program program_glu;
287
356
  cl_program program_im2col_f16;
288
357
  cl_program program_im2col_f32;
289
358
  cl_program program_mul_mat_Ab_Bi_8x4;
@@ -299,6 +368,7 @@ struct ggml_backend_opencl_context {
299
368
  cl_program program_mul_mv_f16_f32;
300
369
  cl_program program_mul_mv_f32_f32;
301
370
  cl_program program_mul;
371
+ cl_program program_mul_mat_f16_f32_tiled;
302
372
  cl_program program_div;
303
373
  cl_program program_sub;
304
374
  cl_program program_norm;
@@ -330,10 +400,13 @@ struct ggml_backend_opencl_context {
330
400
  cl_kernel kernel_scale;
331
401
  cl_kernel kernel_silu, kernel_silu_4;
332
402
  cl_kernel kernel_gelu, kernel_gelu_4;
403
+ cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
333
404
  cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
334
405
  cl_kernel kernel_relu;
335
406
  cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
336
407
  cl_kernel kernel_clamp;
408
+ cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
409
+ kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
337
410
  cl_kernel kernel_norm;
338
411
  cl_kernel kernel_rms_norm;
339
412
  cl_kernel kernel_group_norm;
@@ -341,6 +414,7 @@ struct ggml_backend_opencl_context {
341
414
  cl_kernel kernel_soft_max, kernel_soft_max_4;
342
415
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
343
416
  cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
417
+ cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
344
418
  cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
345
419
  cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
346
420
  cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -349,6 +423,7 @@ struct ggml_backend_opencl_context {
349
423
  cl_kernel kernel_mul_mat_f16_f32_1row;
350
424
  cl_kernel kernel_mul_mat_f16_f32;
351
425
  cl_kernel kernel_mul_mat_f16_f32_l4;
426
+ cl_kernel kernel_mul_mat_f16_f32_tiled;
352
427
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
353
428
  cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
354
429
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -369,6 +444,118 @@ struct ggml_backend_opencl_context {
369
444
  cl_kernel kernel_timestep_embedding;
370
445
  cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
371
446
 
447
+ std::vector<ProfilingInfo> profiling_info;
448
+
449
+ void write_profiling_info() {
450
+ FILE * fperf = fopen("cl_profiling.csv", "w");
451
+ if (!fperf) {
452
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
453
+ return;
454
+ }
455
+
456
+ // Populate profiling info
457
+ for (ProfilingInfo & info : profiling_info) {
458
+ cl_ulong cmd_queued;
459
+ cl_ulong cmd_submit;
460
+ cl_ulong cmd_start;
461
+ cl_ulong cmd_end;
462
+ cl_ulong cmd_complete;
463
+
464
+ CL_CHECK(clWaitForEvents(1, &info.evt));
465
+ CL_CHECK(clGetEventProfilingInfo(
466
+ info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
467
+ CL_CHECK(clGetEventProfilingInfo(
468
+ info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
469
+ CL_CHECK(clGetEventProfilingInfo(
470
+ info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
471
+ CL_CHECK(clGetEventProfilingInfo(
472
+ info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
473
+ CL_CHECK(clGetEventProfilingInfo(
474
+ info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
475
+ CL_CHECK(clReleaseEvent(info.evt));
476
+
477
+ char kernel_name[512];
478
+ CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
479
+ sizeof(kernel_name), kernel_name, NULL));
480
+ info.kernel_name = kernel_name;
481
+
482
+ info.cmd_queued = cmd_queued;
483
+ info.cmd_submit = cmd_submit;
484
+ info.cmd_start = cmd_start;
485
+ info.cmd_end = cmd_end;
486
+
487
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
488
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
489
+ info.cmd_duration_ns = cmd_end - cmd_start;
490
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
491
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
492
+ }
493
+
494
+ // Dump a csv
495
+ float total_kernel_time = 0;
496
+ fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
497
+ for (const ProfilingInfo & info : profiling_info) {
498
+ total_kernel_time += info.cmd_duration_ns/1.e6f;
499
+ fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
500
+ info.op_name.c_str(), info.kernel_name.c_str(),
501
+ info.cmd_queued_duration_ns/1.e6f,
502
+ info.cmd_submit_duration_ns/1.e6f,
503
+ info.cmd_duration_ns/1.e6f,
504
+ info.cmd_complete_duration_ns/1.e6f,
505
+ info.cmd_total_duration_ns/1.e6f,
506
+ info.global_size[0], info.global_size[1], info.global_size[2],
507
+ info.local_size[0], info.local_size[1], info.local_size[2],
508
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
509
+ }
510
+ fclose(fperf);
511
+
512
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
513
+
514
+ // Dump a simple chrome trace
515
+ FILE* ftrace = fopen("cl_trace.json", "w");
516
+ if (!ftrace) {
517
+ GGML_LOG_ERROR("Failed to open cl_trace.json\n");
518
+ return;
519
+ }
520
+
521
+ fprintf(ftrace, "[\n");
522
+ for (const ProfilingInfo & info : profiling_info) {
523
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
524
+ info.kernel_name.c_str(), info.cmd_queued/1000);
525
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
526
+ info.kernel_name.c_str(), info.cmd_submit/1000);
527
+
528
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
529
+ info.kernel_name.c_str(), info.cmd_start/1000);
530
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
531
+ info.kernel_name.c_str(), info.cmd_end/1000);
532
+ }
533
+ fclose(ftrace);
534
+ }
535
+
536
+ size_t get_kernel_workgroup_size(cl_kernel kernel) const {
537
+ size_t workgroup_size = 0;
538
+ size_t ret_size = 0;
539
+ CL_CHECK(
540
+ clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
541
+ sizeof(size_t), &workgroup_size, &ret_size));
542
+ GGML_ASSERT(sizeof(size_t) == ret_size);
543
+ return workgroup_size;
544
+ }
545
+
546
+ void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
547
+ #ifdef GGML_OPENCL_PROFILING
548
+ cl_event evt;
549
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
550
+
551
+ profiling_info.emplace_back();
552
+ populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
553
+ #else
554
+ GGML_UNUSED(tensor);
555
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
556
+ #endif
557
+ }
558
+
372
559
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
373
560
  // Transpose kernels
374
561
  cl_program program_transpose;
@@ -395,46 +582,19 @@ struct ggml_backend_opencl_context {
395
582
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
396
583
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
397
584
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
398
- };
399
-
400
- // All registered devices with a default device in the front.
401
- static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
402
585
 
403
- // Profiling
586
+ void free() {
587
+ ref_count--;
588
+ if (ref_count == 0) {
404
589
  #ifdef GGML_OPENCL_PROFILING
405
- struct ProfilingInfo {
406
- std::string op_name;
407
- std::string kernel_name;
408
-
409
- cl_kernel kernel;
410
- cl_event evt;
411
-
412
- cl_ulong cmd_queued;
413
- cl_ulong cmd_submit;
414
- cl_ulong cmd_start;
415
- cl_ulong cmd_end;
416
- cl_ulong overhead_start;
417
- cl_ulong overhead_end;
418
- // For the times below, see spec for clGetEventProfilingInfo
419
- // The time kernel spent in cmd queue - SUBMIT - QUEUED
420
- cl_ulong cmd_queued_duration_ns;
421
- // The time kernel spent for submission - START - SUBMIT
422
- cl_ulong cmd_submit_duration_ns;
423
- // Kernel execution time in nanoseconds - END - START
424
- cl_ulong cmd_duration_ns;
425
- // The time for the kernel to complete - COMPLETE - END
426
- cl_ulong cmd_complete_duration_ns;
427
- // Total time to finish the kernel - COMPELTE - QUEUED
428
- cl_ulong cmd_total_duration_ns;
429
- // Global and local work sizes.
430
- size_t global_size[3];
431
- size_t local_size[3];
432
- // Op output size.
433
- size_t output_size[4];
590
+ write_profiling_info();
591
+ #endif
592
+ }
593
+ }
434
594
  };
435
595
 
436
- std::vector<ProfilingInfo> g_profiling_info;
437
- #endif
596
+ // All registered devices with a default device in the front.
597
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
438
598
 
439
599
  inline std::string read_file(const std::string &path) {
440
600
  std::ifstream ifs(path);
@@ -591,11 +751,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
591
751
 
592
752
  CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
593
753
  CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
754
+ CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
755
+ CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
594
756
  CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
595
757
  CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
596
758
  GGML_LOG_CONT(".");
597
759
  }
598
760
 
761
+ // glu
762
+ {
763
+ #ifdef GGML_OPENCL_EMBED_KERNELS
764
+ const std::string kernel_src {
765
+ #include "glu.cl.h"
766
+ };
767
+ #else
768
+ const std::string kernel_src = read_file("glu.cl");
769
+ #endif
770
+ backend_ctx->program_glu =
771
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
772
+
773
+ CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
774
+ CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
775
+ CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
776
+ CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
777
+ CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
778
+ CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
779
+ CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
780
+ CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
781
+ CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
782
+ CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
783
+ GGML_LOG_CONT(".");
784
+ }
785
+
599
786
  // get_rows
600
787
  {
601
788
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -830,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
830
1017
  GGML_LOG_CONT(".");
831
1018
  }
832
1019
 
1020
+ // mul_mat_f16_f32_tiled
1021
+ {
1022
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1023
+ const std::string kernel_src {
1024
+ #include "mul_mat_f16_f32.cl.h"
1025
+ };
1026
+ #else
1027
+ const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
1028
+ #endif
1029
+ backend_ctx->program_mul_mat_f16_f32_tiled =
1030
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1031
+
1032
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
1033
+ GGML_LOG_CONT(".");
1034
+ }
1035
+
833
1036
  // mul
834
1037
  {
835
1038
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1258,6 +1461,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1258
1461
  }
1259
1462
  }
1260
1463
 
1464
+ // set_rows
1465
+ {
1466
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1467
+ const std::string kernel_src {
1468
+ #include "set_rows.cl.h"
1469
+ };
1470
+ #else
1471
+ const std::string kernel_src = read_file("set_rows.cl");
1472
+ #endif
1473
+ backend_ctx->program_set_rows =
1474
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1475
+
1476
+ CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
1477
+ CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
1478
+ GGML_LOG_CONT(".");
1479
+ }
1480
+
1261
1481
  // mul_mv_id_q4_0_f32_8x_flat
1262
1482
  {
1263
1483
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1669,6 +1889,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1669
1889
  backend_ctx->device = dev_ctx->device;
1670
1890
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
1671
1891
 
1892
+ // ref_count get increased in ggml_backend_opencl_device_init
1893
+ // This function is also used to retrieve backend context, so we don't want
1894
+ // to increase ref_count for each call. We only want to increase ref_count
1895
+ // when the associated device is initialized
1896
+ backend_ctx->ref_count = 0;
1897
+
1672
1898
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
1673
1899
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
1674
1900
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1841,93 +2067,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1841
2067
  return dev_ctx->backend_ctx;
1842
2068
  }
1843
2069
 
1844
- static void ggml_cl2_free(void) {
1845
- #ifdef GGML_OPENCL_PROFILING
1846
- FILE * fperf = fopen("cl_profiling.csv", "w");
1847
- if (!fperf) {
1848
- GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
1849
- return;
1850
- }
2070
+ static void ggml_cl2_free(ggml_backend_t backend) {
2071
+ ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
2072
+ ctx->free();
1851
2073
 
1852
- // Populate profiling info
1853
- for (ProfilingInfo & info : g_profiling_info) {
1854
- cl_ulong cmd_queued;
1855
- cl_ulong cmd_submit;
1856
- cl_ulong cmd_start;
1857
- cl_ulong cmd_end;
1858
- cl_ulong cmd_complete;
1859
-
1860
- CL_CHECK(clWaitForEvents(1, &info.evt));
1861
- CL_CHECK(clGetEventProfilingInfo(
1862
- info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
1863
- CL_CHECK(clGetEventProfilingInfo(
1864
- info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
1865
- CL_CHECK(clGetEventProfilingInfo(
1866
- info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
1867
- CL_CHECK(clGetEventProfilingInfo(
1868
- info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
1869
- CL_CHECK(clGetEventProfilingInfo(
1870
- info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
1871
- CL_CHECK(clReleaseEvent(info.evt));
1872
-
1873
- char kernel_name[512];
1874
- CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
1875
- sizeof(kernel_name), kernel_name, NULL));
1876
- info.kernel_name = kernel_name;
1877
-
1878
- info.cmd_queued = cmd_queued;
1879
- info.cmd_submit = cmd_submit;
1880
- info.cmd_start = cmd_start;
1881
- info.cmd_end = cmd_end;
1882
-
1883
- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
1884
- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
1885
- info.cmd_duration_ns = cmd_end - cmd_start;
1886
- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
1887
- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
1888
- }
1889
-
1890
- // Dump a csv
1891
- float total_kernel_time = 0;
1892
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
1893
- for (const ProfilingInfo & info : g_profiling_info) {
1894
- total_kernel_time += info.cmd_duration_ns/1.e6f;
1895
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
1896
- info.op_name.c_str(), info.kernel_name.c_str(),
1897
- info.cmd_queued_duration_ns/1.e6f,
1898
- info.cmd_submit_duration_ns/1.e6f,
1899
- info.cmd_duration_ns/1.e6f,
1900
- info.cmd_complete_duration_ns/1.e6f,
1901
- info.cmd_total_duration_ns/1.e6f,
1902
- info.global_size[0], info.global_size[1], info.global_size[2],
1903
- info.local_size[0], info.local_size[1], info.local_size[2],
1904
- info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
1905
- }
1906
- fclose(fperf);
1907
-
1908
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
1909
-
1910
- // Dump a simple chrome trace
1911
- FILE* ftrace = fopen("cl_trace.json", "w");
1912
- if (!ftrace) {
1913
- GGML_LOG_ERROR("Failed to open cl_trace.json\n");
1914
- return;
2074
+ // The CL context is shared by all backends, release it if all backends have been released
2075
+ bool should_release_opencl = true;
2076
+ for (auto device : g_ggml_backend_opencl_devices) {
2077
+ ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2078
+ if (ctx_dev->backend_ctx->ref_count > 0) {
2079
+ should_release_opencl = false;
2080
+ }
1915
2081
  }
1916
2082
 
1917
- fprintf(ftrace, "[\n");
1918
- for (const ProfilingInfo & info : g_profiling_info) {
1919
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1920
- info.kernel_name.c_str(), info.cmd_queued/1000);
1921
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1922
- info.kernel_name.c_str(), info.cmd_submit/1000);
1923
-
1924
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1925
- info.kernel_name.c_str(), info.cmd_start/1000);
1926
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1927
- info.kernel_name.c_str(), info.cmd_end/1000);
2083
+ if (should_release_opencl) {
2084
+ CL_CHECK(clReleaseContext(ctx->context));
1928
2085
  }
1929
- fclose(ftrace);
1930
- #endif
1931
2086
  }
1932
2087
 
1933
2088
  //------------------------------------------------------------------------------
@@ -2011,9 +2166,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
2011
2166
  }
2012
2167
 
2013
2168
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
2014
- ggml_cl2_free();
2015
-
2016
- GGML_UNUSED(backend);
2169
+ ggml_cl2_free(backend);
2017
2170
  }
2018
2171
 
2019
2172
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -2088,7 +2241,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
2088
2241
  // dependencies.
2089
2242
  sync_with_other_backends(backend);
2090
2243
 
2091
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2244
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2092
2245
  continue;
2093
2246
  }
2094
2247
 
@@ -2123,6 +2276,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2123
2276
  default:
2124
2277
  return false;
2125
2278
  }
2279
+ case GGML_OP_SET_ROWS:
2280
+ {
2281
+ // TODO: add support
2282
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14274
2283
+ if (op->src[0]->type != GGML_TYPE_F32) {
2284
+ return false;
2285
+ }
2286
+ switch (op->type) {
2287
+ case GGML_TYPE_F16:
2288
+ case GGML_TYPE_F32:
2289
+ return true;
2290
+ default:
2291
+ return false;
2292
+ }
2293
+ }
2126
2294
  case GGML_OP_CPY:
2127
2295
  case GGML_OP_DUP:
2128
2296
  case GGML_OP_CONT:
@@ -2157,6 +2325,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2157
2325
  case GGML_UNARY_OP_GELU:
2158
2326
  case GGML_UNARY_OP_SILU:
2159
2327
  case GGML_UNARY_OP_RELU:
2328
+ case GGML_UNARY_OP_GELU_ERF:
2160
2329
  case GGML_UNARY_OP_GELU_QUICK:
2161
2330
  return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
2162
2331
  case GGML_UNARY_OP_SIGMOID:
@@ -2167,6 +2336,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2167
2336
  default:
2168
2337
  return false;
2169
2338
  }
2339
+ case GGML_OP_GLU:
2340
+ switch (ggml_get_glu_op(op)) {
2341
+ case GGML_GLU_OP_GEGLU:
2342
+ case GGML_GLU_OP_REGLU:
2343
+ case GGML_GLU_OP_SWIGLU:
2344
+ case GGML_GLU_OP_GEGLU_ERF:
2345
+ case GGML_GLU_OP_GEGLU_QUICK:
2346
+ return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
2347
+ default:
2348
+ return false;
2349
+ }
2170
2350
  case GGML_OP_CLAMP:
2171
2351
  return op->src[0]->type == GGML_TYPE_F32;
2172
2352
  case GGML_OP_SOFT_MAX:
@@ -2899,6 +3079,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
2899
3079
 
2900
3080
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2901
3081
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
3082
+ // Getting a new reference to the backend, increase ref_count
3083
+ backend_ctx->ref_count++;
2902
3084
 
2903
3085
  ggml_backend_t backend = new ggml_backend {
2904
3086
  /* .guid = */ ggml_backend_opencl_guid(),
@@ -3089,7 +3271,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3089
3271
 
3090
3272
  // Open file and dump.
3091
3273
  char fname[512];
3092
- sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
3274
+ snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
3093
3275
  FILE * f = fopen(fname, "w");
3094
3276
  if (!f) {
3095
3277
  printf("Failed to open %s\n", fname);
@@ -3159,31 +3341,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3159
3341
  #define dump_tensor(tensor)
3160
3342
  #endif
3161
3343
 
3162
- //------------------------------------------------------------------------------
3163
- // Profiling utility
3164
- //------------------------------------------------------------------------------
3165
- #ifdef GGML_OPENCL_PROFILING
3166
- static void populateProfilingInfo(
3167
- ProfilingInfo& info, cl_event evt, cl_kernel kernel,
3168
- size_t global_size[3], size_t local_size[3],
3169
- const ggml_tensor * tensor) {
3170
- info.op_name = tensor->name;
3171
- info.kernel = kernel;
3172
- info.evt = evt;
3173
-
3174
- info.local_size[0] = local_size[0];
3175
- info.local_size[1] = local_size[1];
3176
- info.local_size[2] = local_size[2];
3177
- info.global_size[0] = global_size[0];
3178
- info.global_size[1] = global_size[1];
3179
- info.global_size[2] = global_size[2];
3180
- info.output_size[0] = tensor->ne[0];
3181
- info.output_size[1] = tensor->ne[1];
3182
- info.output_size[2] = tensor->ne[2];
3183
- info.output_size[3] = tensor->ne[3];
3184
- }
3185
- #endif
3186
-
3187
3344
  //------------------------------------------------------------------------------
3188
3345
  // Ops
3189
3346
  //------------------------------------------------------------------------------
@@ -3227,7 +3384,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3227
3384
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
3228
3385
 
3229
3386
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3230
- cl_command_queue queue = backend_ctx->queue;
3231
3387
 
3232
3388
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3233
3389
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3271,15 +3427,112 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3271
3427
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3272
3428
  size_t local_work_size[] = {1, 1, 1};
3273
3429
 
3274
- #ifdef GGML_OPENCL_PROFILING
3275
- cl_event evt;
3276
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3430
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3431
+ }
3277
3432
 
3278
- g_profiling_info.emplace_back();
3279
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3280
- #else
3281
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3282
- #endif
3433
+ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3434
+ GGML_ASSERT(src0);
3435
+ GGML_ASSERT(src0->extra);
3436
+ GGML_ASSERT(src1);
3437
+ GGML_ASSERT(src1->extra);
3438
+ GGML_ASSERT(dst);
3439
+ GGML_ASSERT(dst->extra);
3440
+
3441
+ // ne0 = ne00
3442
+ // ne2 = ne02
3443
+ // ne3 = ne03
3444
+
3445
+ const int ne01 = src0->ne[1];
3446
+ const int ne02 = src0->ne[2];
3447
+ const int ne03 = src0->ne[3];
3448
+
3449
+ const cl_ulong nb01 = src0->nb[1];
3450
+ const cl_ulong nb02 = src0->nb[2];
3451
+ const cl_ulong nb03 = src0->nb[3];
3452
+
3453
+ const int ne11 = src1->ne[1];
3454
+ const int ne12 = src1->ne[2];
3455
+
3456
+ const cl_ulong nb10 = src1->nb[0];
3457
+ const cl_ulong nb11 = src1->nb[1];
3458
+ const cl_ulong nb12 = src1->nb[2];
3459
+
3460
+ const int ne0 = dst->ne[0];
3461
+
3462
+ const cl_ulong nb1 = dst->nb[1];
3463
+ const cl_ulong nb2 = dst->nb[2];
3464
+ const cl_ulong nb3 = dst->nb[3];
3465
+
3466
+ const int nblk0 = ne0/ggml_blck_size(dst->type);
3467
+
3468
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3469
+
3470
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3471
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3472
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3473
+
3474
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3475
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3476
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3477
+
3478
+ cl_kernel kernel;
3479
+
3480
+ switch (dst->type) {
3481
+ case GGML_TYPE_F32:
3482
+ kernel = backend_ctx->kernel_set_rows_f32;
3483
+ break;
3484
+ case GGML_TYPE_F16:
3485
+ kernel = backend_ctx->kernel_set_rows_f16;
3486
+ break;
3487
+ default:
3488
+ GGML_ABORT("not implemented");
3489
+ }
3490
+
3491
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3492
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3493
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3494
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3495
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3496
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3497
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
3498
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3499
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3500
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3501
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
3502
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
3503
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
3504
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
3505
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
3506
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
3507
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
3508
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
3509
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
3510
+
3511
+ int nth0 = 64;
3512
+ if (backend_ctx->gpu_family == INTEL) {
3513
+ nth0 = 32;
3514
+ } else if (backend_ctx->gpu_family == ADRENO) {
3515
+ nth0 = 64;
3516
+ }
3517
+
3518
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
3519
+ while (nth0 < nblk0 && nth0 < max_workgroup_size) {
3520
+ nth0 *= 2;
3521
+ }
3522
+
3523
+ int rows_per_workgroup = 1;
3524
+ if (nth0 > nblk0) {
3525
+ rows_per_workgroup = nth0 / nblk0;
3526
+ nth0 = nblk0;
3527
+ }
3528
+
3529
+ size_t global_work_size[] = {
3530
+ (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
3531
+ (size_t)ne02*rows_per_workgroup,
3532
+ (size_t)ne03};
3533
+ size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
3534
+
3535
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3283
3536
  }
3284
3537
 
3285
3538
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3321,7 +3574,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3321
3574
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3322
3575
 
3323
3576
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3324
- cl_command_queue queue = backend_ctx->queue;
3325
3577
 
3326
3578
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3327
3579
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3396,29 +3648,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3396
3648
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3397
3649
  }
3398
3650
 
3399
- #ifdef GGML_OPENCL_PROFILING
3400
- cl_event evt;
3401
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3402
-
3403
- g_profiling_info.emplace_back();
3404
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3405
- #else
3406
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3407
- #endif
3651
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3408
3652
  } else {
3409
3653
  unsigned int nth = MIN(64, ne0);
3410
3654
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3411
3655
  size_t local_work_size[] = {nth, 1, 1};
3412
3656
 
3413
- #ifdef GGML_OPENCL_PROFILING
3414
- cl_event evt;
3415
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3416
-
3417
- g_profiling_info.emplace_back();
3418
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3419
- #else
3420
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3421
- #endif
3657
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3422
3658
  }
3423
3659
  }
3424
3660
 
@@ -3461,7 +3697,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3461
3697
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3462
3698
 
3463
3699
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3464
- cl_command_queue queue = backend_ctx->queue;
3465
3700
 
3466
3701
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3467
3702
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3536,29 +3771,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3536
3771
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3537
3772
  }
3538
3773
 
3539
- #ifdef GGML_OPENCL_PROFILING
3540
- cl_event evt;
3541
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3542
-
3543
- g_profiling_info.emplace_back();
3544
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3545
- #else
3546
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3547
- #endif
3774
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3548
3775
  } else {
3549
3776
  unsigned int nth = MIN(64, ne0);
3550
3777
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3551
3778
  size_t local_work_size[] = {nth, 1, 1};
3552
3779
 
3553
- #ifdef GGML_OPENCL_PROFILING
3554
- cl_event evt;
3555
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3556
-
3557
- g_profiling_info.emplace_back();
3558
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3559
- #else
3560
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3561
- #endif
3780
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3562
3781
  }
3563
3782
  }
3564
3783
 
@@ -3598,7 +3817,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3598
3817
  const cl_ulong nb3 = dst->nb[3];
3599
3818
 
3600
3819
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3601
- cl_command_queue queue = backend_ctx->queue;
3602
3820
 
3603
3821
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3604
3822
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3661,29 +3879,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3661
3879
  size_t global_work_size[] = {(size_t)n, 1, 1};
3662
3880
  size_t local_work_size[] = {64, 1, 1};
3663
3881
 
3664
- #ifdef GGML_OPENCL_PROFILING
3665
- cl_event evt;
3666
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3667
-
3668
- g_profiling_info.emplace_back();
3669
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3670
- #else
3671
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3672
- #endif
3882
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3673
3883
  } else {
3674
3884
  unsigned int nth = MIN(64, ne0);
3675
3885
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3676
3886
  size_t local_work_size[] = {nth, 1, 1};
3677
3887
 
3678
- #ifdef GGML_OPENCL_PROFILING
3679
- cl_event evt;
3680
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3681
-
3682
- g_profiling_info.emplace_back();
3683
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3684
- #else
3685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3686
- #endif
3888
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3687
3889
  }
3688
3890
  }
3689
3891
 
@@ -3723,7 +3925,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3723
3925
  const cl_ulong nb3 = dst->nb[3];
3724
3926
 
3725
3927
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3726
- cl_command_queue queue = backend_ctx->queue;
3727
3928
 
3728
3929
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3729
3930
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3786,29 +3987,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3786
3987
  size_t global_work_size[] = {(size_t)n, 1, 1};
3787
3988
  size_t local_work_size[] = {64, 1, 1};
3788
3989
 
3789
- #ifdef GGML_OPENCL_PROFILING
3790
- cl_event evt;
3791
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3792
-
3793
- g_profiling_info.emplace_back();
3794
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3795
- #else
3796
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3797
- #endif
3990
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3798
3991
  } else {
3799
3992
  unsigned int nth = MIN(64, ne0);
3800
3993
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3801
3994
  size_t local_work_size[] = {nth, 1, 1};
3802
3995
 
3803
- #ifdef GGML_OPENCL_PROFILING
3804
- cl_event evt;
3805
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3806
-
3807
- g_profiling_info.emplace_back();
3808
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3809
- #else
3810
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3811
- #endif
3996
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3812
3997
  }
3813
3998
  }
3814
3999
 
@@ -3821,7 +4006,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3821
4006
  UNUSED(src1);
3822
4007
 
3823
4008
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3824
- cl_command_queue queue = backend_ctx->queue;
3825
4009
 
3826
4010
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3827
4011
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3848,15 +4032,45 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3848
4032
  size_t global_work_size[] = {(size_t)n, 1, 1};
3849
4033
  size_t local_work_size[] = {64, 1, 1};
3850
4034
 
3851
- #ifdef GGML_OPENCL_PROFILING
3852
- cl_event evt;
3853
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
4035
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4036
+ }
3854
4037
 
3855
- g_profiling_info.emplace_back();
3856
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3857
- #else
3858
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3859
- #endif
4038
+ static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4039
+ GGML_ASSERT(src0);
4040
+ GGML_ASSERT(src0->extra);
4041
+ GGML_ASSERT(dst);
4042
+ GGML_ASSERT(dst->extra);
4043
+
4044
+ UNUSED(src1);
4045
+
4046
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4047
+
4048
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4049
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4050
+
4051
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4052
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4053
+
4054
+ cl_kernel kernel;
4055
+
4056
+ int n = ggml_nelements(dst);
4057
+
4058
+ if (n % 4 == 0) {
4059
+ kernel = backend_ctx->kernel_gelu_erf_4;
4060
+ n /= 4;
4061
+ } else {
4062
+ kernel = backend_ctx->kernel_gelu_erf;
4063
+ }
4064
+
4065
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4066
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
4067
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4068
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4069
+
4070
+ size_t global_work_size[] = {(size_t)n, 1, 1};
4071
+ size_t local_work_size[] = {64, 1, 1};
4072
+
4073
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3860
4074
  }
3861
4075
 
3862
4076
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3868,7 +4082,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3868
4082
  UNUSED(src1);
3869
4083
 
3870
4084
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3871
- cl_command_queue queue = backend_ctx->queue;
3872
4085
 
3873
4086
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3874
4087
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3895,15 +4108,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3895
4108
  size_t global_work_size[] = {(size_t)n, 1, 1};
3896
4109
  size_t local_work_size[] = {64, 1, 1};
3897
4110
 
3898
- #ifdef GGML_OPENCL_PROFILING
3899
- cl_event evt;
3900
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3901
-
3902
- g_profiling_info.emplace_back();
3903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3904
- #else
3905
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3906
- #endif
4111
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3907
4112
  }
3908
4113
 
3909
4114
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3915,7 +4120,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3915
4120
  UNUSED(src1);
3916
4121
 
3917
4122
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3918
- cl_command_queue queue = backend_ctx->queue;
3919
4123
 
3920
4124
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3921
4125
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3947,15 +4151,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3947
4151
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3948
4152
  }
3949
4153
 
3950
- #ifdef GGML_OPENCL_PROFILING
3951
- cl_event evt;
3952
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3953
-
3954
- g_profiling_info.emplace_back();
3955
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3956
- #else
3957
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3958
- #endif
4154
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3959
4155
  }
3960
4156
 
3961
4157
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3967,7 +4163,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3967
4163
  UNUSED(src1);
3968
4164
 
3969
4165
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3970
- cl_command_queue queue = backend_ctx->queue;
3971
4166
 
3972
4167
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3973
4168
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3992,15 +4187,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3992
4187
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3993
4188
  }
3994
4189
 
3995
- #ifdef GGML_OPENCL_PROFILING
3996
- cl_event evt;
3997
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3998
-
3999
- g_profiling_info.emplace_back();
4000
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4001
- #else
4002
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4003
- #endif
4190
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4004
4191
  }
4005
4192
 
4006
4193
  static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4012,7 +4199,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4012
4199
  UNUSED(src1);
4013
4200
 
4014
4201
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4015
- cl_command_queue queue = backend_ctx->queue;
4016
4202
 
4017
4203
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4018
4204
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4044,15 +4230,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
4044
4230
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4045
4231
  }
4046
4232
 
4047
- #ifdef GGML_OPENCL_PROFILING
4048
- cl_event evt;
4049
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4050
-
4051
- g_profiling_info.emplace_back();
4052
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4053
- #else
4054
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4055
- #endif
4233
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4056
4234
  }
4057
4235
 
4058
4236
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4064,7 +4242,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4064
4242
  UNUSED(src1);
4065
4243
 
4066
4244
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4067
- cl_command_queue queue = backend_ctx->queue;
4068
4245
 
4069
4246
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4070
4247
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4096,15 +4273,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
4096
4273
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4097
4274
  }
4098
4275
 
4099
- #ifdef GGML_OPENCL_PROFILING
4100
- cl_event evt;
4101
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4102
-
4103
- g_profiling_info.emplace_back();
4104
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4105
- #else
4106
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4107
- #endif
4276
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4108
4277
  }
4109
4278
 
4110
4279
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4116,7 +4285,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4116
4285
  UNUSED(src1);
4117
4286
 
4118
4287
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4119
- cl_command_queue queue = backend_ctx->queue;
4120
4288
 
4121
4289
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4122
4290
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4157,15 +4325,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
4157
4325
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4158
4326
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4159
4327
 
4160
- #ifdef GGML_OPENCL_PROFILING
4161
- cl_event evt;
4162
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4163
-
4164
- g_profiling_info.emplace_back();
4165
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4166
- #else
4167
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4168
- #endif
4328
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4169
4329
  }
4170
4330
 
4171
4331
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4177,7 +4337,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4177
4337
  UNUSED(src1);
4178
4338
 
4179
4339
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4180
- cl_command_queue queue = backend_ctx->queue;
4181
4340
 
4182
4341
  //ggml_backend_opencl_device_context * dev_ctx =
4183
4342
  // (ggml_backend_opencl_device_context *)backend->device->context;
@@ -4241,15 +4400,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4241
4400
  // This is local memory - the size depends on subgroup size.
4242
4401
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
4243
4402
 
4244
- #ifdef GGML_OPENCL_PROFILING
4245
- cl_event evt;
4246
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4247
-
4248
- g_profiling_info.emplace_back();
4249
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4250
- #else
4251
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4252
- #endif
4403
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4253
4404
  }
4254
4405
 
4255
4406
  static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4261,7 +4412,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4261
4412
  UNUSED(src1);
4262
4413
 
4263
4414
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4264
- cl_command_queue queue = backend_ctx->queue;
4265
4415
 
4266
4416
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4267
4417
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4300,15 +4450,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4300
4450
  size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4301
4451
  size_t local_work_size[] = {(size_t)sgs, 1, 1};
4302
4452
 
4303
- #ifdef GGML_OPENCL_PROFILING
4304
- cl_event evt;
4305
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4306
-
4307
- g_profiling_info.emplace_back();
4308
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4309
- #else
4310
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4311
- #endif
4453
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4312
4454
  }
4313
4455
 
4314
4456
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4320,7 +4462,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4320
4462
  UNUSED(src1);
4321
4463
 
4322
4464
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4323
- cl_command_queue queue = backend_ctx->queue;
4324
4465
 
4325
4466
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4326
4467
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4397,16 +4538,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
4397
4538
  }
4398
4539
  if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4399
4540
 
4400
-
4401
- #ifdef GGML_OPENCL_PROFILING
4402
- cl_event evt;
4403
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4404
-
4405
- g_profiling_info.emplace_back();
4406
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4407
- #else
4408
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4409
- #endif
4541
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4410
4542
  }
4411
4543
 
4412
4544
  static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -4419,7 +4551,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4419
4551
  UNUSED(src1_shape_def);
4420
4552
 
4421
4553
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
- cl_command_queue queue = backend_ctx->queue;
4423
4554
 
4424
4555
  if (backend_ctx->kernel_repeat == nullptr) {
4425
4556
  GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4467,15 +4598,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
4467
4598
 
4468
4599
  size_t global_work_size[] = { gws0, gws1, gws2 };
4469
4600
 
4470
- #ifdef GGML_OPENCL_PROFILING
4471
- cl_event evt;
4472
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
4473
-
4474
- g_profiling_info.emplace_back();
4475
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
4476
- #else
4477
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4478
- #endif
4601
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4479
4602
  }
4480
4603
 
4481
4604
  static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4488,7 +4611,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4488
4611
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4489
4612
 
4490
4613
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4491
- cl_command_queue queue = backend_ctx->queue;
4492
4614
 
4493
4615
  if (backend_ctx->kernel_pad == nullptr) {
4494
4616
  GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4533,15 +4655,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
4533
4655
  local_work_size_ptr = nullptr;
4534
4656
  }
4535
4657
 
4536
- #ifdef GGML_OPENCL_PROFILING
4537
- cl_event evt;
4538
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4539
-
4540
- g_profiling_info.emplace_back();
4541
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4542
- #else
4543
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4544
- #endif
4658
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4545
4659
  }
4546
4660
 
4547
4661
  static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4553,9 +4667,9 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4553
4667
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4554
4668
 
4555
4669
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4556
- cl_command_queue queue = backend_ctx->queue;
4557
4670
 
4558
- const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4671
+ const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4672
+ const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
4559
4673
  cl_kernel kernel = nullptr;
4560
4674
 
4561
4675
  if (mode == GGML_SCALE_MODE_NEAREST) {
@@ -4586,18 +4700,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4586
4700
  const cl_ulong nb02 = src0->nb[2];
4587
4701
  const cl_ulong nb03 = src0->nb[3];
4588
4702
 
4589
- const int ne00_src = src0->ne[0];
4590
- const int ne01_src = src0->ne[1];
4703
+ const int ne00 = src0->ne[0];
4704
+ const int ne01 = src0->ne[1];
4705
+ const int ne02 = src0->ne[2];
4706
+ const int ne03 = src0->ne[3];
4707
+
4708
+ const int ne0 = dst->ne[0];
4709
+ const int ne1 = dst->ne[1];
4710
+ const int ne2 = dst->ne[2];
4711
+ const int ne3 = dst->ne[3];
4591
4712
 
4592
- const int ne10_dst = dst->ne[0];
4593
- const int ne11_dst = dst->ne[1];
4594
- const int ne12_dst = dst->ne[2];
4595
- const int ne13_dst = dst->ne[3];
4713
+ float sf0 = (float)ne0 / ne00;
4714
+ float sf1 = (float)ne1 / ne01;
4715
+ float sf2 = (float)ne2 / ne02;
4716
+ float sf3 = (float)ne3 / ne03;
4596
4717
 
4597
- const float sf0 = (float)dst->ne[0] / src0->ne[0];
4598
- const float sf1 = (float)dst->ne[1] / src0->ne[1];
4599
- const float sf2 = (float)dst->ne[2] / src0->ne[2];
4600
- const float sf3 = (float)dst->ne[3] / src0->ne[3];
4718
+ float pixel_offset = 0.5f;
4601
4719
 
4602
4720
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4603
4721
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
@@ -4609,29 +4727,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4609
4727
  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
4610
4728
 
4611
4729
  if (mode == GGML_SCALE_MODE_NEAREST) {
4612
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
4613
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
4614
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
4615
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
4730
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
4731
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
4732
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
4733
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
4616
4734
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
4617
4735
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
4618
4736
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
4619
4737
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
4620
4738
  } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4621
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
4622
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
4623
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
4624
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
4625
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
4626
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
4739
+ if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
4740
+ sf0 = (float)(ne0 - 1) / (ne00 - 1);
4741
+ sf1 = (float)(ne1 - 1) / (ne01 - 1);
4742
+ pixel_offset = 0.0f;
4743
+ }
4744
+
4745
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
4746
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
4747
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
4748
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
4749
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
4750
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
4627
4751
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
4628
4752
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
4629
4753
  CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
4630
4754
  CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
4755
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
4631
4756
  }
4632
4757
 
4633
4758
 
4634
- size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
4759
+ size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
4635
4760
  if (dst_total_elements == 0) {
4636
4761
  return;
4637
4762
  }
@@ -4644,17 +4769,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4644
4769
  local_work_size_ptr = nullptr;
4645
4770
  }
4646
4771
 
4647
- #ifdef GGML_OPENCL_PROFILING
4648
- cl_event evt;
4649
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4650
-
4651
- g_profiling_info.emplace_back();
4652
- size_t profiling_gws[3] = {global_work_size[0], 1, 1};
4653
- size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
4654
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4655
- #else
4656
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4657
- #endif
4772
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4658
4773
  }
4659
4774
 
4660
4775
  static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4732,7 +4847,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4732
4847
  global_work_size[1] = d_ne1;
4733
4848
  global_work_size[2] = d_ne2;
4734
4849
 
4735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4850
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4736
4851
  }
4737
4852
  }
4738
4853
  } else {
@@ -4782,7 +4897,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
4782
4897
  d_ne2 > 0 ? (size_t)d_ne2 : 1,
4783
4898
  d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4784
4899
 
4785
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
4900
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
4786
4901
  }
4787
4902
  }
4788
4903
 
@@ -4795,7 +4910,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4795
4910
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
4796
4911
 
4797
4912
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4798
- cl_command_queue queue = backend_ctx->queue;
4799
4913
 
4800
4914
  if (backend_ctx->kernel_timestep_embedding == nullptr) {
4801
4915
  GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
@@ -4828,17 +4942,59 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4828
4942
 
4829
4943
  size_t global_work_size[] = {gws0, gws1, 1};
4830
4944
 
4831
- #ifdef GGML_OPENCL_PROFILING
4832
- cl_event evt;
4833
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
4945
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4946
+ }
4834
4947
 
4835
- g_profiling_info.emplace_back();
4836
- size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
4837
- size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
4838
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4839
- #else
4840
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
4841
- #endif
4948
+ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4949
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4950
+
4951
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4952
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4953
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4954
+
4955
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4956
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
4957
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4958
+
4959
+ const int M = src0->ne[1];
4960
+ const int N = src1->ne[1];
4961
+ const int K = src0->ne[0];
4962
+
4963
+ cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
4964
+
4965
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
4966
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
4967
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
4968
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device));
4969
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
4970
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
4971
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
4972
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
4973
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
4974
+
4975
+ // Tiling parameters. These need to be tuned for optimal performance.
4976
+ // They must match the #defines in the kernel mul_mat_f16_f32.cl.
4977
+ //
4978
+ // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
4979
+ // TPWM / TPWN: Threads per Work-group. This is the work-group size.
4980
+ // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
4981
+ //
4982
+ // The following relationships must hold:
4983
+ // OPWM = TPWM * OPTM
4984
+ // OPWN = TPWN * OPTN
4985
+ //
4986
+ const int OPWM = 64;
4987
+ const int OPWN = 64;
4988
+ const int TPWM = 16;
4989
+ const int TPWN = 8;
4990
+
4991
+ size_t local_work_size[2] = { TPWM, TPWN };
4992
+ size_t global_work_size[2] = {
4993
+ (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
4994
+ (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
4995
+ };
4996
+
4997
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
4842
4998
  }
4843
4999
 
4844
5000
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4853,7 +5009,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4853
5009
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4854
5010
 
4855
5011
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4856
- cl_command_queue queue = backend_ctx->queue;
5012
+
5013
+ if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
5014
+ src0->ne[1] > 32 && // M > 32
5015
+ src1->ne[1] > 32 && // N > 32
5016
+ src0->ne[0] > 32 && // K > 32
5017
+ src0->ne[2] == 1 && src0->ne[3] == 1 &&
5018
+ src1->ne[2] == 1 && src1->ne[3] == 1 &&
5019
+ ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
5020
+ backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
5021
+ ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
5022
+ return;
5023
+ }
4857
5024
 
4858
5025
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4859
5026
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5058,15 +5225,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5058
5225
  static_cast<size_t>(padded_height_B)
5059
5226
  };
5060
5227
 
5061
- #ifdef GGML_OPENCL_PROFILING
5062
- cl_event evt;
5063
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
5064
-
5065
- g_profiling_info.emplace_back();
5066
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
5067
- #else
5068
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
5069
- #endif
5228
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
5070
5229
  } else {
5071
5230
  // no need to transpose B in other cases
5072
5231
  // create an image for B from sub_buffer
@@ -5188,16 +5347,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5188
5347
 
5189
5348
  // enqueue kernel with profiling
5190
5349
  // <--------------------------------------------> //
5191
- #ifdef GGML_OPENCL_PROFILING
5192
- cl_event evt;
5193
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5194
-
5195
- g_profiling_info.emplace_back();
5196
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5197
- // enqueue kernel without profiling
5198
- #else
5199
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5200
- #endif
5350
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5201
5351
  // <--------------------------------------------> //
5202
5352
 
5203
5353
  // deallocate sub buffers and images
@@ -5277,15 +5427,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5277
5427
  global_work_size[2] = (size_t)ne12*ne13;
5278
5428
  }
5279
5429
 
5280
- #ifdef GGML_OPENCL_PROFILING
5281
- cl_event evt;
5282
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5283
-
5284
- g_profiling_info.emplace_back();
5285
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5286
- #else
5287
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5288
- #endif
5430
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5289
5431
  return;
5290
5432
  }
5291
5433
  #else // GGML_OPENCL_SOA_Q
@@ -5515,15 +5657,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5515
5657
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5516
5658
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5517
5659
 
5518
- #ifdef GGML_OPENCL_PROFILING
5519
- cl_event evt;
5520
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5521
-
5522
- g_profiling_info.emplace_back();
5523
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5524
- #else
5525
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5526
- #endif
5660
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5527
5661
  } else if (src0t == GGML_TYPE_Q4_K) {
5528
5662
  GGML_ASSERT(false && "not implemented");
5529
5663
  } else if (src0t == GGML_TYPE_Q3_K) {
@@ -5534,30 +5668,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
5534
5668
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
5535
5669
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5536
5670
 
5537
- #ifdef GGML_OPENCL_PROFILING
5538
- cl_event evt;
5539
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5540
-
5541
- g_profiling_info.emplace_back();
5542
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5543
- #else
5544
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5545
- #endif
5671
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5546
5672
  } else {
5547
5673
  int64_t ny = (ne11 + nrows - 1)/nrows;
5548
5674
 
5549
5675
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
5550
5676
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
5551
5677
 
5552
- #ifdef GGML_OPENCL_PROFILING
5553
- cl_event evt;
5554
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5555
-
5556
- g_profiling_info.emplace_back();
5557
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5558
- #else
5559
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5560
- #endif
5678
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5561
5679
  }
5562
5680
  }
5563
5681
 
@@ -5574,7 +5692,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5574
5692
  GGML_ASSERT(src2->extra);
5575
5693
 
5576
5694
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5577
- cl_command_queue queue = backend_ctx->queue;
5578
5695
 
5579
5696
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5580
5697
  ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
@@ -5680,15 +5797,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
5680
5797
  size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5681
5798
  size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5682
5799
 
5683
- #ifdef GGML_OPENCL_PROFILING
5684
- cl_event evt;
5685
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5686
-
5687
- g_profiling_info.emplace_back();
5688
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5689
- #else
5690
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5691
- #endif
5800
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5692
5801
  }
5693
5802
 
5694
5803
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5701,10 +5810,11 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5701
5810
  GGML_ASSERT(ggml_is_contiguous(src0));
5702
5811
 
5703
5812
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5704
- cl_command_queue queue = backend_ctx->queue;
5705
5813
 
5706
5814
  float scale;
5707
- memcpy(&scale, dst->op_params, sizeof(scale));
5815
+ float bias;
5816
+ memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
5817
+ memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
5708
5818
 
5709
5819
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5710
5820
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5719,6 +5829,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5719
5829
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
5720
5830
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
5721
5831
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
5832
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
5722
5833
 
5723
5834
  int n = ggml_nelements(dst)/4;
5724
5835
 
@@ -5730,15 +5841,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5730
5841
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5731
5842
  }
5732
5843
 
5733
- #ifdef GGML_OPENCL_PROFILING
5734
- cl_event evt;
5735
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5736
-
5737
- g_profiling_info.emplace_back();
5738
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5739
- #else
5740
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5741
- #endif
5844
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5742
5845
  }
5743
5846
 
5744
5847
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5775,7 +5878,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5775
5878
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
5776
5879
 
5777
5880
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5778
- cl_command_queue queue = backend_ctx->queue;
5779
5881
 
5780
5882
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5781
5883
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5840,15 +5942,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
5840
5942
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5841
5943
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5842
5944
 
5843
- #ifdef GGML_OPENCL_PROFILING
5844
- cl_event evt;
5845
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5846
-
5847
- g_profiling_info.emplace_back();
5848
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
5849
- #else
5850
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5851
- #endif
5945
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
5852
5946
  }
5853
5947
 
5854
5948
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5871,7 +5965,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5871
5965
  const int ne02 = src0 ? src0->ne[2] : 0;
5872
5966
 
5873
5967
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5874
- cl_command_queue queue = backend_ctx->queue;
5875
5968
 
5876
5969
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5877
5970
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5895,15 +5988,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5895
5988
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
5896
5989
  size_t local_work_size[] = {64, 1, 1};
5897
5990
 
5898
- #ifdef GGML_OPENCL_PROFILING
5899
- cl_event evt;
5900
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5901
-
5902
- g_profiling_info.emplace_back();
5903
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5904
- #else
5905
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5906
- #endif
5991
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5907
5992
  } else {
5908
5993
  kernel = backend_ctx->kernel_diag_mask_inf;
5909
5994
 
@@ -5923,15 +6008,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5923
6008
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5924
6009
  }
5925
6010
 
5926
- #ifdef GGML_OPENCL_PROFILING
5927
- cl_event evt;
5928
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5929
-
5930
- g_profiling_info.emplace_back();
5931
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5932
- #else
5933
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5934
- #endif
6011
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5935
6012
  }
5936
6013
  }
5937
6014
 
@@ -5951,7 +6028,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5951
6028
  }
5952
6029
 
5953
6030
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5954
- cl_command_queue queue = backend_ctx->queue;
5955
6031
 
5956
6032
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5957
6033
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5963,19 +6039,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5963
6039
 
5964
6040
  cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
5965
6041
 
5966
- const int ne00 = src0 ? src0->ne[0] : 0;
5967
- const int ne01 = src0 ? src0->ne[1] : 0;
5968
- const int ne02 = src0 ? src0->ne[2] : 0;
5969
- const int ne03 = src0 ? src0->ne[3] : 0;
6042
+ const int ne00 = src0->ne[0];
6043
+ const int ne01 = src0->ne[1];
6044
+ const int ne02 = src0->ne[2];
6045
+ const int ne03 = src0->ne[3];
6046
+
6047
+ const cl_long nb01 = src0->nb[1];
6048
+ const cl_long nb02 = src0->nb[2];
6049
+ const cl_long nb03 = src0->nb[3];
6050
+
6051
+ const int ne12 = src1 ? src1->ne[2] : 0;
6052
+ const int ne13 = src1 ? src1->ne[3] : 0;
6053
+
6054
+ const cl_long nb11 = src1 ? src1->nb[1] : 0;
6055
+ const cl_long nb12 = src1 ? src1->nb[2] : 0;
6056
+ const cl_long nb13 = src1 ? src1->nb[3] : 0;
6057
+
6058
+ const cl_long nb1 = dst->nb[1];
6059
+ const cl_long nb2 = dst->nb[2];
6060
+ const cl_long nb3 = dst->nb[3];
5970
6061
 
5971
6062
  float scale, max_bias;
5972
6063
  memcpy(&scale, dst->op_params + 0, sizeof(float));
5973
6064
  memcpy(&max_bias, dst->op_params + 1, sizeof(float));
5974
6065
 
5975
- const int nrows_x = ggml_nrows(src0);
5976
- const int nrows_y = src0->ne[1];
5977
-
5978
- const int n_head = nrows_x/nrows_y;
6066
+ const int n_head = src0->ne[2];
5979
6067
  const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
5980
6068
 
5981
6069
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
@@ -6020,26 +6108,27 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
6020
6108
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6021
6109
  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6022
6110
  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
6023
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
6024
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
6025
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
6026
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
6027
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
6028
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
6029
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
6111
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
6112
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
6113
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
6114
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
6115
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
6116
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
6117
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
6118
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
6119
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
6120
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
6121
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
6122
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
6123
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
6124
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
6125
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
6126
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
6030
6127
 
6031
6128
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6032
6129
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6033
6130
 
6034
- #ifdef GGML_OPENCL_PROFILING
6035
- cl_event evt;
6036
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6037
-
6038
- g_profiling_info.emplace_back();
6039
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6040
- #else
6041
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6042
- #endif
6131
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6043
6132
  }
6044
6133
 
6045
6134
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6051,7 +6140,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6051
6140
  GGML_ASSERT(dst->extra);
6052
6141
 
6053
6142
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6054
- cl_command_queue queue = backend_ctx->queue;
6055
6143
 
6056
6144
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6057
6145
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -6217,15 +6305,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
6217
6305
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6218
6306
  size_t local_work_size[] = {(size_t)nth, 1, 1};
6219
6307
 
6220
- #ifdef GGML_OPENCL_PROFILING
6221
- cl_event evt;
6222
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6223
-
6224
- g_profiling_info.emplace_back();
6225
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6226
- #else
6227
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6228
- #endif
6308
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6229
6309
  }
6230
6310
 
6231
6311
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6240,7 +6320,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6240
6320
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
6241
6321
 
6242
6322
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6243
- cl_command_queue queue = backend_ctx->queue;
6244
6323
 
6245
6324
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6246
6325
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6309,15 +6388,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
6309
6388
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
6310
6389
  size_t local_work_size[] = {256, 1, 1};
6311
6390
 
6312
- #ifdef GGML_OPENCL_PROFILING
6313
- cl_event evt;
6314
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6315
-
6316
- g_profiling_info.emplace_back();
6317
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6318
- #else
6319
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6320
- #endif
6391
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6321
6392
  }
6322
6393
 
6323
6394
  static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6332,7 +6403,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6332
6403
  GGML_ASSERT(ggml_is_contiguous(src0));
6333
6404
 
6334
6405
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6335
- cl_command_queue queue = backend_ctx->queue;
6336
6406
 
6337
6407
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6338
6408
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6364,15 +6434,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
6364
6434
  size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
6365
6435
  size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
6366
6436
 
6367
- #ifdef GGML_OPENCL_PROFILING
6368
- cl_event evt;
6369
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6370
-
6371
- g_profiling_info.emplace_back();
6372
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6373
- #else
6374
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6375
- #endif
6437
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6376
6438
  }
6377
6439
 
6378
6440
  static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6386,7 +6448,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6386
6448
  GGML_ASSERT(ggml_is_contiguous(src0));
6387
6449
 
6388
6450
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6389
- cl_command_queue queue = backend_ctx->queue;
6390
6451
 
6391
6452
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6392
6453
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -6427,15 +6488,106 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6427
6488
  size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6428
6489
  size_t local_work_size[] = {(size_t)64, 1, 1};
6429
6490
 
6430
- #ifdef GGML_OPENCL_PROFILING
6431
- cl_event evt;
6432
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
6491
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6492
+ }
6433
6493
 
6434
- g_profiling_info.emplace_back();
6435
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
6436
- #else
6437
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
6438
- #endif
6494
+ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6495
+ GGML_ASSERT(src0);
6496
+ GGML_ASSERT(src0->extra);
6497
+ GGML_ASSERT(dst);
6498
+ GGML_ASSERT(dst->extra);
6499
+
6500
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
6501
+
6502
+ if (src1) {
6503
+ GGML_ASSERT(src1);
6504
+ GGML_ASSERT(src1->extra);
6505
+ GGML_ASSERT(ggml_are_same_shape(src0, src1));
6506
+ }
6507
+
6508
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6509
+
6510
+ cl_kernel kernel;
6511
+ switch (ggml_get_glu_op(dst)) {
6512
+ case GGML_GLU_OP_GEGLU:
6513
+ if (dst->type == GGML_TYPE_F32) {
6514
+ kernel = backend_ctx->kernel_geglu;
6515
+ } else {
6516
+ kernel = backend_ctx->kernel_geglu_f16;
6517
+ }
6518
+ break;
6519
+ case GGML_GLU_OP_REGLU:
6520
+ if (dst->type == GGML_TYPE_F32) {
6521
+ kernel = backend_ctx->kernel_reglu;
6522
+ } else {
6523
+ kernel = backend_ctx->kernel_reglu_f16;
6524
+ }
6525
+ break;
6526
+ case GGML_GLU_OP_SWIGLU:
6527
+ if (dst->type == GGML_TYPE_F32) {
6528
+ kernel = backend_ctx->kernel_swiglu;
6529
+ } else {
6530
+ kernel = backend_ctx->kernel_swiglu_f16;
6531
+ }
6532
+ break;
6533
+ case GGML_GLU_OP_GEGLU_ERF:
6534
+ if (dst->type == GGML_TYPE_F32) {
6535
+ kernel = backend_ctx->kernel_geglu_erf;
6536
+ } else {
6537
+ kernel = backend_ctx->kernel_geglu_erf_f16;
6538
+ }
6539
+ break;
6540
+ case GGML_GLU_OP_GEGLU_QUICK:
6541
+ if (dst->type == GGML_TYPE_F32) {
6542
+ kernel = backend_ctx->kernel_geglu_quick;
6543
+ } else {
6544
+ kernel = backend_ctx->kernel_geglu_quick_f16;
6545
+ }
6546
+ break;
6547
+ default:
6548
+ GGML_ABORT("Unsupported glu op");
6549
+ }
6550
+
6551
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6552
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6553
+
6554
+ ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
6555
+
6556
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
6557
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
6558
+
6559
+ cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
6560
+
6561
+ const int ne0 = dst->ne[0];
6562
+
6563
+ const cl_ulong nb01 = src0->nb[1];
6564
+ const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
6565
+
6566
+ const cl_ulong nb1 = dst->nb[1];
6567
+
6568
+ const int swp = ((const int32_t *) dst->op_params)[1];
6569
+ const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
6570
+ const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
6571
+
6572
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6573
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6574
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
6575
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6576
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6577
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6578
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
6579
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
6580
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
6581
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
6582
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
6583
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
6584
+
6585
+ const size_t nrows = ggml_nrows(src0);
6586
+ size_t nth = 512;
6587
+ size_t global_work_size[] = {nrows*nth, 1, 1};
6588
+ size_t local_work_size[] = {nth, 1, 1};
6589
+
6590
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6439
6591
  }
6440
6592
 
6441
6593
  //------------------------------------------------------------------------------
@@ -6461,6 +6613,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6461
6613
  }
6462
6614
  func = ggml_cl_get_rows;
6463
6615
  break;
6616
+ case GGML_OP_SET_ROWS:
6617
+ if (!any_on_device) {
6618
+ return false;
6619
+ }
6620
+ func = ggml_cl_set_rows;
6621
+ break;
6464
6622
  case GGML_OP_CPY:
6465
6623
  if (!any_on_device) {
6466
6624
  return false;
@@ -6506,6 +6664,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6506
6664
  }
6507
6665
  func = ggml_cl_gelu;
6508
6666
  break;
6667
+ case GGML_UNARY_OP_GELU_ERF:
6668
+ if (!any_on_device) {
6669
+ return false;
6670
+ }
6671
+ func = ggml_cl_gelu_erf;
6672
+ break;
6509
6673
  case GGML_UNARY_OP_GELU_QUICK:
6510
6674
  if (!any_on_device) {
6511
6675
  return false;
@@ -6539,6 +6703,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6539
6703
  default:
6540
6704
  return false;
6541
6705
  } break;
6706
+ case GGML_OP_GLU:
6707
+ if (!any_on_device) {
6708
+ return false;
6709
+ }
6710
+ func = ggml_cl_glu;
6711
+ break;
6542
6712
  case GGML_OP_CLAMP:
6543
6713
  if (!any_on_device) {
6544
6714
  return false;