@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
231
231
  return { type, major, minor, patch };
232
232
  }
233
233
 
234
+ // Profiling
235
+ struct ProfilingInfo {
236
+ std::string op_name;
237
+ std::string kernel_name;
238
+
239
+ cl_kernel kernel;
240
+ cl_event evt;
241
+
242
+ cl_ulong cmd_queued;
243
+ cl_ulong cmd_submit;
244
+ cl_ulong cmd_start;
245
+ cl_ulong cmd_end;
246
+ cl_ulong overhead_start;
247
+ cl_ulong overhead_end;
248
+ // For the times below, see spec for clGetEventProfilingInfo
249
+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
250
+ cl_ulong cmd_queued_duration_ns;
251
+ // The time kernel spent for submission - START - SUBMIT
252
+ cl_ulong cmd_submit_duration_ns;
253
+ // Kernel execution time in nanoseconds - END - START
254
+ cl_ulong cmd_duration_ns;
255
+ // The time for the kernel to complete - COMPLETE - END
256
+ cl_ulong cmd_complete_duration_ns;
257
+ // Total time to finish the kernel - COMPLETE - QUEUED
258
+ cl_ulong cmd_total_duration_ns;
259
+ // Global and local work sizes.
260
+ size_t global_size[3];
261
+ size_t local_size[3];
262
+ // Op output size.
263
+ size_t output_size[4];
264
+ };
265
+
266
+ static void populateProfilingInfo(
267
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
268
+ size_t global_size[3], size_t local_size[3],
269
+ const ggml_tensor * tensor) {
270
+ info.op_name = tensor->name;
271
+ info.kernel = kernel;
272
+ info.evt = evt;
273
+
274
+ // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
275
+ info.local_size[0] = 0;
276
+ info.local_size[1] = 0;
277
+ info.local_size[2] = 0;
278
+
279
+ info.global_size[0] = 0;
280
+ info.global_size[1] = 0;
281
+ info.global_size[2] = 0;
282
+
283
+ if (local_size) {
284
+ for (cl_uint i = 0; i < work_dim; ++i) {
285
+ info.local_size[i] = local_size[i];
286
+ }
287
+ }
288
+
289
+ for (cl_uint i = 0; i < work_dim; ++i) {
290
+ info.global_size[i] = global_size[i];
291
+ }
292
+
293
+ info.output_size[0] = tensor->ne[0];
294
+ info.output_size[1] = tensor->ne[1];
295
+ info.output_size[2] = tensor->ne[2];
296
+ info.output_size[3] = tensor->ne[3];
297
+ }
298
+
234
299
  struct ggml_backend_opencl_context;
235
300
 
236
301
  // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
254
319
 
255
320
  // backend context
256
321
  struct ggml_backend_opencl_context {
322
+ int ref_count;
323
+
257
324
  cl_device_id device;
258
325
  std::string device_name;
259
326
 
@@ -315,6 +382,13 @@ struct ggml_backend_opencl_context {
315
382
  cl_program program_softmax_4_f16;
316
383
  cl_program program_argsort_f32_i32;
317
384
  cl_program program_sum_rows_f32;
385
+ cl_program program_repeat;
386
+ cl_program program_pad;
387
+ cl_program program_tanh;
388
+ cl_program program_upscale;
389
+ cl_program program_concat;
390
+ cl_program program_tsembd;
391
+ cl_program program_mul_mv_id_q4_0_f32_8x_flat;
318
392
 
319
393
  cl_kernel kernel_add, kernel_add_row;
320
394
  cl_kernel kernel_mul, kernel_mul_row;
@@ -351,6 +425,118 @@ struct ggml_backend_opencl_context {
351
425
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
352
426
  cl_kernel kernel_argsort_f32_i32;
353
427
  cl_kernel kernel_sum_rows_f32;
428
+ cl_kernel kernel_repeat;
429
+ cl_kernel kernel_pad;
430
+ cl_kernel kernel_tanh_f32_nd;
431
+ cl_kernel kernel_tanh_f16_nd;
432
+ cl_kernel kernel_upscale;
433
+ cl_kernel kernel_upscale_bilinear;
434
+ cl_kernel kernel_concat_f32_contiguous;
435
+ cl_kernel kernel_concat_f32_non_contiguous;
436
+ cl_kernel kernel_timestep_embedding;
437
+ cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
438
+
439
+ std::vector<ProfilingInfo> profiling_info;
440
+
441
+ void write_profiling_info() {
442
+ FILE * fperf = fopen("cl_profiling.csv", "w");
443
+ if (!fperf) {
444
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
445
+ return;
446
+ }
447
+
448
+ // Populate profiling info
449
+ for (ProfilingInfo & info : profiling_info) {
450
+ cl_ulong cmd_queued;
451
+ cl_ulong cmd_submit;
452
+ cl_ulong cmd_start;
453
+ cl_ulong cmd_end;
454
+ cl_ulong cmd_complete;
455
+
456
+ CL_CHECK(clWaitForEvents(1, &info.evt));
457
+ CL_CHECK(clGetEventProfilingInfo(
458
+ info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
459
+ CL_CHECK(clGetEventProfilingInfo(
460
+ info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
461
+ CL_CHECK(clGetEventProfilingInfo(
462
+ info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
463
+ CL_CHECK(clGetEventProfilingInfo(
464
+ info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
465
+ CL_CHECK(clGetEventProfilingInfo(
466
+ info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
467
+ CL_CHECK(clReleaseEvent(info.evt));
468
+
469
+ char kernel_name[512];
470
+ CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
471
+ sizeof(kernel_name), kernel_name, NULL));
472
+ info.kernel_name = kernel_name;
473
+
474
+ info.cmd_queued = cmd_queued;
475
+ info.cmd_submit = cmd_submit;
476
+ info.cmd_start = cmd_start;
477
+ info.cmd_end = cmd_end;
478
+
479
+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
480
+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
481
+ info.cmd_duration_ns = cmd_end - cmd_start;
482
+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
483
+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
484
+ }
485
+
486
+ // Dump a csv
487
+ float total_kernel_time = 0;
488
+ fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
489
+ for (const ProfilingInfo & info : profiling_info) {
490
+ total_kernel_time += info.cmd_duration_ns/1.e6f;
491
+ fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
492
+ info.op_name.c_str(), info.kernel_name.c_str(),
493
+ info.cmd_queued_duration_ns/1.e6f,
494
+ info.cmd_submit_duration_ns/1.e6f,
495
+ info.cmd_duration_ns/1.e6f,
496
+ info.cmd_complete_duration_ns/1.e6f,
497
+ info.cmd_total_duration_ns/1.e6f,
498
+ info.global_size[0], info.global_size[1], info.global_size[2],
499
+ info.local_size[0], info.local_size[1], info.local_size[2],
500
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
501
+ }
502
+ fclose(fperf);
503
+
504
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
505
+
506
+ // Dump a simple chrome trace
507
+ FILE* ftrace = fopen("cl_trace.json", "w");
508
+ if (!ftrace) {
509
+ GGML_LOG_ERROR("Failed to open cl_trace.json\n");
510
+ return;
511
+ }
512
+
513
+ fprintf(ftrace, "[\n");
514
+ for (const ProfilingInfo & info : profiling_info) {
515
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
516
+ info.kernel_name.c_str(), info.cmd_queued/1000);
517
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
518
+ info.kernel_name.c_str(), info.cmd_submit/1000);
519
+
520
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
521
+ info.kernel_name.c_str(), info.cmd_start/1000);
522
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
523
+ info.kernel_name.c_str(), info.cmd_end/1000);
524
+ }
525
+ fclose(ftrace);
526
+ }
527
+
528
+ void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
529
+ #ifdef GGML_OPENCL_PROFILING
530
+ cl_event evt;
531
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
532
+
533
+ profiling_info.emplace_back();
534
+ populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
535
+ #else
536
+ GGML_UNUSED(tensor);
537
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
538
+ #endif
539
+ }
354
540
 
355
541
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
356
542
  // Transpose kernels
@@ -378,46 +564,19 @@ struct ggml_backend_opencl_context {
378
564
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
379
565
  cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
380
566
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
381
- };
382
-
383
- // All registered devices with a default device in the front.
384
- static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
385
567
 
386
- // Profiling
568
+ void free() {
569
+ ref_count--;
570
+ if (ref_count == 0) {
387
571
  #ifdef GGML_OPENCL_PROFILING
388
- struct ProfilingInfo {
389
- std::string op_name;
390
- std::string kernel_name;
391
-
392
- cl_kernel kernel;
393
- cl_event evt;
394
-
395
- cl_ulong cmd_queued;
396
- cl_ulong cmd_submit;
397
- cl_ulong cmd_start;
398
- cl_ulong cmd_end;
399
- cl_ulong overhead_start;
400
- cl_ulong overhead_end;
401
- // For the times below, see spec for clGetEventProfilingInfo
402
- // The time kernel spent in cmd queue - SUBMIT - QUEUED
403
- cl_ulong cmd_queued_duration_ns;
404
- // The time kernel spent for submission - START - SUBMIT
405
- cl_ulong cmd_submit_duration_ns;
406
- // Kernel execution time in nanoseconds - END - START
407
- cl_ulong cmd_duration_ns;
408
- // The time for the kernel to complete - COMPLETE - END
409
- cl_ulong cmd_complete_duration_ns;
410
- // Total time to finish the kernel - COMPELTE - QUEUED
411
- cl_ulong cmd_total_duration_ns;
412
- // Global and local work sizes.
413
- size_t global_size[3];
414
- size_t local_size[3];
415
- // Op output size.
416
- size_t output_size[4];
572
+ write_profiling_info();
573
+ #endif
574
+ }
575
+ }
417
576
  };
418
577
 
419
- std::vector<ProfilingInfo> g_profiling_info;
420
- #endif
578
+ // All registered devices with a default device in the front.
579
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
421
580
 
422
581
  inline std::string read_file(const std::string &path) {
423
582
  std::ifstream ifs(path);
@@ -1097,6 +1256,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1097
1256
  GGML_LOG_CONT(".");
1098
1257
  }
1099
1258
 
1259
+ // repeat
1260
+ {
1261
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1262
+ const std::string kernel_src {
1263
+ #include "repeat.cl.h"
1264
+ };
1265
+ #else
1266
+ const std::string kernel_src = read_file("repeat.cl");
1267
+ #endif
1268
+ if (!kernel_src.empty()) {
1269
+ backend_ctx->program_repeat =
1270
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1271
+ CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
1272
+ GGML_LOG_CONT(".");
1273
+ } else {
1274
+ GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
1275
+ backend_ctx->program_repeat = nullptr;
1276
+ backend_ctx->kernel_repeat = nullptr;
1277
+ }
1278
+ }
1279
+
1280
+ // pad
1281
+ {
1282
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1283
+ const std::string kernel_src {
1284
+ #include "pad.cl.h"
1285
+ };
1286
+ #else
1287
+ const std::string kernel_src = read_file("pad.cl");
1288
+ #endif
1289
+ if (!kernel_src.empty()) {
1290
+ backend_ctx->program_pad =
1291
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1292
+ CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
1293
+ GGML_LOG_CONT(".");
1294
+ } else {
1295
+ GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
1296
+ backend_ctx->program_pad = nullptr;
1297
+ backend_ctx->kernel_pad = nullptr;
1298
+ }
1299
+ }
1300
+
1301
+ // tanh
1302
+ {
1303
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1304
+ const std::string kernel_src {
1305
+ #include "tanh.cl.h"
1306
+ };
1307
+ #else
1308
+ const std::string kernel_src = read_file("tanh.cl");
1309
+ #endif
1310
+ if (!kernel_src.empty()) {
1311
+ backend_ctx->program_tanh =
1312
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1313
+ CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
1314
+ CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
1315
+ GGML_LOG_CONT(".");
1316
+ } else {
1317
+ GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
1318
+ backend_ctx->program_tanh = nullptr;
1319
+ backend_ctx->kernel_tanh_f32_nd = nullptr;
1320
+ backend_ctx->kernel_tanh_f16_nd = nullptr;
1321
+ }
1322
+ }
1323
+
1324
+ // upscale
1325
+ {
1326
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1327
+ const std::string kernel_src {
1328
+ #include "upscale.cl.h"
1329
+ };
1330
+ #else
1331
+ const std::string kernel_src = read_file("upscale.cl");
1332
+ #endif
1333
+ if (!kernel_src.empty()) {
1334
+ backend_ctx->program_upscale =
1335
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1336
+ CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
1337
+ if (backend_ctx->program_upscale) {
1338
+ cl_int err_bilinear;
1339
+ backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
1340
+ if (err_bilinear != CL_SUCCESS) {
1341
+ GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
1342
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1343
+ }
1344
+ } else {
1345
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1346
+ }
1347
+ GGML_LOG_CONT(".");
1348
+ } else {
1349
+ GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
1350
+ backend_ctx->program_upscale = nullptr;
1351
+ backend_ctx->kernel_upscale = nullptr;
1352
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1353
+ }
1354
+ }
1355
+
1356
+ // concat
1357
+ {
1358
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1359
+ const std::string kernel_src {
1360
+ #include "concat.cl.h"
1361
+ };
1362
+ #else
1363
+
1364
+ const std::string kernel_src = read_file("concat.cl");
1365
+ #endif
1366
+ if (!kernel_src.empty()) {
1367
+ backend_ctx->program_concat =
1368
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1369
+
1370
+ CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
1371
+ CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
1372
+ GGML_LOG_CONT(".");
1373
+ } else {
1374
+ GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
1375
+ backend_ctx->program_concat = nullptr;
1376
+ backend_ctx->kernel_concat_f32_contiguous = nullptr;
1377
+ backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
1378
+ }
1379
+ }
1380
+
1381
+ // timestep_embedding
1382
+ {
1383
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1384
+ const std::string kernel_src {
1385
+ #include "tsembd.cl.h"
1386
+ };
1387
+ #else
1388
+
1389
+ const std::string kernel_src = read_file("tsembd.cl");
1390
+ #endif
1391
+ if (!kernel_src.empty()) {
1392
+ backend_ctx->program_tsembd =
1393
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1394
+ CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
1395
+ GGML_LOG_CONT(".");
1396
+ } else {
1397
+ GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
1398
+ backend_ctx->program_tsembd = nullptr;
1399
+ backend_ctx->kernel_timestep_embedding = nullptr;
1400
+ }
1401
+ }
1402
+
1403
+ // mul_mv_id_q4_0_f32_8x_flat
1404
+ {
1405
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1406
+ const std::string kernel_src {
1407
+ #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
1408
+ };
1409
+ #else
1410
+ const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
1411
+ #endif
1412
+ backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
1413
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1414
+
1415
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
1416
+ GGML_LOG_CONT(".");
1417
+ }
1418
+
1100
1419
  // Adreno kernels
1101
1420
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1102
1421
  // transpose
@@ -1492,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1492
1811
  backend_ctx->device = dev_ctx->device;
1493
1812
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
1494
1813
 
1814
+ // ref_count get increased in ggml_backend_opencl_device_init
1815
+ // This function is also used to retrieve backend context, so we don't want
1816
+ // to increase ref_count for each call. We only want to increase ref_count
1817
+ // when the associated device is initialized
1818
+ backend_ctx->ref_count = 0;
1819
+
1495
1820
  if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
1496
1821
  strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
1497
1822
  strstr(dev_ctx->device_version.c_str(), "Adreno")) {
@@ -1664,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
1664
1989
  return dev_ctx->backend_ctx;
1665
1990
  }
1666
1991
 
1667
- static void ggml_cl2_free(void) {
1668
- #ifdef GGML_OPENCL_PROFILING
1669
- FILE * fperf = fopen("cl_profiling.csv", "w");
1670
- if (!fperf) {
1671
- GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
1672
- return;
1673
- }
1992
+ static void ggml_cl2_free(ggml_backend_t backend) {
1993
+ ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
1994
+ ctx->free();
1674
1995
 
1675
- // Populate profiling info
1676
- for (ProfilingInfo & info : g_profiling_info) {
1677
- cl_ulong cmd_queued;
1678
- cl_ulong cmd_submit;
1679
- cl_ulong cmd_start;
1680
- cl_ulong cmd_end;
1681
- cl_ulong cmd_complete;
1682
-
1683
- CL_CHECK(clWaitForEvents(1, &info.evt));
1684
- CL_CHECK(clGetEventProfilingInfo(
1685
- info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
1686
- CL_CHECK(clGetEventProfilingInfo(
1687
- info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
1688
- CL_CHECK(clGetEventProfilingInfo(
1689
- info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
1690
- CL_CHECK(clGetEventProfilingInfo(
1691
- info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
1692
- CL_CHECK(clGetEventProfilingInfo(
1693
- info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
1694
- CL_CHECK(clReleaseEvent(info.evt));
1695
-
1696
- char kernel_name[512];
1697
- CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
1698
- sizeof(kernel_name), kernel_name, NULL));
1699
- info.kernel_name = kernel_name;
1700
-
1701
- info.cmd_queued = cmd_queued;
1702
- info.cmd_submit = cmd_submit;
1703
- info.cmd_start = cmd_start;
1704
- info.cmd_end = cmd_end;
1705
-
1706
- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
1707
- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
1708
- info.cmd_duration_ns = cmd_end - cmd_start;
1709
- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
1710
- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
1711
- }
1712
-
1713
- // Dump a csv
1714
- float total_kernel_time = 0;
1715
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
1716
- for (const ProfilingInfo & info : g_profiling_info) {
1717
- total_kernel_time += info.cmd_duration_ns/1.e6f;
1718
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
1719
- info.op_name.c_str(), info.kernel_name.c_str(),
1720
- info.cmd_queued_duration_ns/1.e6f,
1721
- info.cmd_submit_duration_ns/1.e6f,
1722
- info.cmd_duration_ns/1.e6f,
1723
- info.cmd_complete_duration_ns/1.e6f,
1724
- info.cmd_total_duration_ns/1.e6f,
1725
- info.global_size[0], info.global_size[1], info.global_size[2],
1726
- info.local_size[0], info.local_size[1], info.local_size[2],
1727
- info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
1728
- }
1729
- fclose(fperf);
1730
-
1731
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
1732
-
1733
- // Dump a simple chrome trace
1734
- FILE* ftrace = fopen("cl_trace.json", "w");
1735
- if (!ftrace) {
1736
- GGML_LOG_ERROR("Failed to open cl_trace.json\n");
1737
- return;
1996
+ // The CL context is shared by all backends, release it if all backends have been released
1997
+ bool should_release_opencl = true;
1998
+ for (auto device : g_ggml_backend_opencl_devices) {
1999
+ ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2000
+ if (ctx_dev->backend_ctx->ref_count > 0) {
2001
+ should_release_opencl = false;
2002
+ }
1738
2003
  }
1739
2004
 
1740
- fprintf(ftrace, "[\n");
1741
- for (const ProfilingInfo & info : g_profiling_info) {
1742
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1743
- info.kernel_name.c_str(), info.cmd_queued/1000);
1744
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
1745
- info.kernel_name.c_str(), info.cmd_submit/1000);
1746
-
1747
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1748
- info.kernel_name.c_str(), info.cmd_start/1000);
1749
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1750
- info.kernel_name.c_str(), info.cmd_end/1000);
2005
+ if (should_release_opencl) {
2006
+ CL_CHECK(clReleaseContext(ctx->context));
1751
2007
  }
1752
- fclose(ftrace);
1753
- #endif
1754
2008
  }
1755
2009
 
1756
2010
  //------------------------------------------------------------------------------
@@ -1834,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
1834
2088
  }
1835
2089
 
1836
2090
  static void ggml_backend_opencl_free(ggml_backend_t backend) {
1837
- ggml_cl2_free();
1838
-
1839
- GGML_UNUSED(backend);
2091
+ ggml_cl2_free(backend);
1840
2092
  }
1841
2093
 
1842
2094
  static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1863,7 +2115,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
1863
2115
  }
1864
2116
 
1865
2117
  static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
1866
- GGML_UNUSED(backend);
2118
+ auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
2119
+
2120
+ cl_event evt;
2121
+ CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
2122
+ CL_CHECK(clWaitForEvents(1, &evt));
2123
+ CL_CHECK(clReleaseEvent(evt));
1867
2124
  }
1868
2125
 
1869
2126
  // Syncronizes the 'backend_ctx's device with others so that commands
@@ -1976,9 +2233,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1976
2233
  case GGML_UNARY_OP_SILU:
1977
2234
  case GGML_UNARY_OP_RELU:
1978
2235
  case GGML_UNARY_OP_GELU_QUICK:
1979
- return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
2236
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1980
2237
  case GGML_UNARY_OP_SIGMOID:
1981
2238
  return ggml_is_contiguous(op->src[0]);
2239
+ case GGML_UNARY_OP_TANH:
2240
+ return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
2241
+ (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
1982
2242
  default:
1983
2243
  return false;
1984
2244
  }
@@ -1988,6 +2248,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1988
2248
  case GGML_OP_NORM:
1989
2249
  case GGML_OP_RMS_NORM:
1990
2250
  return true;
2251
+ case GGML_OP_REPEAT:
2252
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
2253
+ case GGML_OP_PAD:
2254
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
2255
+ op->src[0]->ne[3] == 1 && op->ne[3] == 1;
2256
+ case GGML_OP_UPSCALE:
2257
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2258
+ case GGML_OP_CONCAT:
2259
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2260
+ case GGML_OP_TIMESTEP_EMBEDDING:
2261
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
1991
2262
  case GGML_OP_GROUP_NORM:
1992
2263
  return ggml_is_contiguous(op->src[0]);
1993
2264
  case GGML_OP_MUL_MAT:
@@ -2000,6 +2271,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2000
2271
  return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2001
2272
  }
2002
2273
  return false;
2274
+ case GGML_OP_MUL_MAT_ID:
2275
+ if (op->src[0]->type == GGML_TYPE_Q4_0) {
2276
+ if (op->src[1]->type == GGML_TYPE_F32) {
2277
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2278
+ }
2279
+ }
2280
+ return false;
2003
2281
  case GGML_OP_RESHAPE:
2004
2282
  case GGML_OP_VIEW:
2005
2283
  case GGML_OP_PERMUTE:
@@ -2052,7 +2330,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
2052
2330
  /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
2053
2331
  /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
2054
2332
  /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
2055
- /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */
2333
+ /* .synchronize = */ ggml_backend_opencl_synchronize,
2056
2334
  /* .graph_plan_create = */ NULL,
2057
2335
  /* .graph_plan_free = */ NULL,
2058
2336
  /* .graph_plan_update = */ NULL,
@@ -2696,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
2696
2974
 
2697
2975
  static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
2698
2976
  ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
2977
+ // Getting a new reference to the backend, increase ref_count
2978
+ backend_ctx->ref_count++;
2699
2979
 
2700
2980
  ggml_backend_t backend = new ggml_backend {
2701
2981
  /* .guid = */ ggml_backend_opencl_guid(),
@@ -2956,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
2956
3236
  #define dump_tensor(tensor)
2957
3237
  #endif
2958
3238
 
2959
- //------------------------------------------------------------------------------
2960
- // Profiling utility
2961
- //------------------------------------------------------------------------------
2962
- #ifdef GGML_OPENCL_PROFILING
2963
- static void populateProfilingInfo(
2964
- ProfilingInfo& info, cl_event evt, cl_kernel kernel,
2965
- size_t global_size[3], size_t local_size[3],
2966
- const ggml_tensor * tensor) {
2967
- info.op_name = tensor->name;
2968
- info.kernel = kernel;
2969
- info.evt = evt;
2970
-
2971
- info.local_size[0] = local_size[0];
2972
- info.local_size[1] = local_size[1];
2973
- info.local_size[2] = local_size[2];
2974
- info.global_size[0] = global_size[0];
2975
- info.global_size[1] = global_size[1];
2976
- info.global_size[2] = global_size[2];
2977
- info.output_size[0] = tensor->ne[0];
2978
- info.output_size[1] = tensor->ne[1];
2979
- info.output_size[2] = tensor->ne[2];
2980
- info.output_size[3] = tensor->ne[3];
2981
- }
2982
- #endif
2983
-
2984
3239
  //------------------------------------------------------------------------------
2985
3240
  // Ops
2986
3241
  //------------------------------------------------------------------------------
@@ -3024,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3024
3279
  const cl_ulong nb2 = dst ? dst->nb[2] : 0;
3025
3280
 
3026
3281
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3027
- cl_command_queue queue = backend_ctx->queue;
3028
3282
 
3029
3283
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3030
3284
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3068,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3068
3322
  size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3069
3323
  size_t local_work_size[] = {1, 1, 1};
3070
3324
 
3071
- #ifdef GGML_OPENCL_PROFILING
3072
- cl_event evt;
3073
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3074
-
3075
- g_profiling_info.emplace_back();
3076
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3077
- #else
3078
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3079
- #endif
3325
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3080
3326
  }
3081
3327
 
3082
3328
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3118,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3118
3364
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3119
3365
 
3120
3366
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3121
- cl_command_queue queue = backend_ctx->queue;
3122
3367
 
3123
3368
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3124
3369
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3193,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3193
3438
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3194
3439
  }
3195
3440
 
3196
- #ifdef GGML_OPENCL_PROFILING
3197
- cl_event evt;
3198
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3199
-
3200
- g_profiling_info.emplace_back();
3201
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3202
- #else
3203
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3204
- #endif
3441
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3205
3442
  } else {
3206
3443
  unsigned int nth = MIN(64, ne0);
3207
3444
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3208
3445
  size_t local_work_size[] = {nth, 1, 1};
3209
3446
 
3210
- #ifdef GGML_OPENCL_PROFILING
3211
- cl_event evt;
3212
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3213
-
3214
- g_profiling_info.emplace_back();
3215
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3216
- #else
3217
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3218
- #endif
3447
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3219
3448
  }
3220
3449
  }
3221
3450
 
@@ -3258,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3258
3487
  const cl_ulong nb3 = dst ? dst->nb[3] : 0;
3259
3488
 
3260
3489
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3261
- cl_command_queue queue = backend_ctx->queue;
3262
3490
 
3263
3491
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3264
3492
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3333,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3333
3561
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3334
3562
  }
3335
3563
 
3336
- #ifdef GGML_OPENCL_PROFILING
3337
- cl_event evt;
3338
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3339
-
3340
- g_profiling_info.emplace_back();
3341
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3342
- #else
3343
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3344
- #endif
3564
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3345
3565
  } else {
3346
3566
  unsigned int nth = MIN(64, ne0);
3347
3567
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3348
3568
  size_t local_work_size[] = {nth, 1, 1};
3349
3569
 
3350
- #ifdef GGML_OPENCL_PROFILING
3351
- cl_event evt;
3352
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3353
-
3354
- g_profiling_info.emplace_back();
3355
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3356
- #else
3357
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3358
- #endif
3570
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3359
3571
  }
3360
3572
  }
3361
3573
 
@@ -3395,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3395
3607
  const cl_ulong nb3 = dst->nb[3];
3396
3608
 
3397
3609
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3398
- cl_command_queue queue = backend_ctx->queue;
3399
3610
 
3400
3611
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3401
3612
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3458,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3458
3669
  size_t global_work_size[] = {(size_t)n, 1, 1};
3459
3670
  size_t local_work_size[] = {64, 1, 1};
3460
3671
 
3461
- #ifdef GGML_OPENCL_PROFILING
3462
- cl_event evt;
3463
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3464
-
3465
- g_profiling_info.emplace_back();
3466
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3467
- #else
3468
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3469
- #endif
3672
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3470
3673
  } else {
3471
3674
  unsigned int nth = MIN(64, ne0);
3472
3675
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3473
3676
  size_t local_work_size[] = {nth, 1, 1};
3474
3677
 
3475
- #ifdef GGML_OPENCL_PROFILING
3476
- cl_event evt;
3477
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3478
-
3479
- g_profiling_info.emplace_back();
3480
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3481
- #else
3482
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3483
- #endif
3678
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3484
3679
  }
3485
3680
  }
3486
3681
 
@@ -3520,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3520
3715
  const cl_ulong nb3 = dst->nb[3];
3521
3716
 
3522
3717
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3523
- cl_command_queue queue = backend_ctx->queue;
3524
3718
 
3525
3719
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3526
3720
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3583,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
3583
3777
  size_t global_work_size[] = {(size_t)n, 1, 1};
3584
3778
  size_t local_work_size[] = {64, 1, 1};
3585
3779
 
3586
- #ifdef GGML_OPENCL_PROFILING
3587
- cl_event evt;
3588
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3589
-
3590
- g_profiling_info.emplace_back();
3591
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3592
- #else
3593
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3594
- #endif
3780
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3595
3781
  } else {
3596
3782
  unsigned int nth = MIN(64, ne0);
3597
3783
  size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
3598
3784
  size_t local_work_size[] = {nth, 1, 1};
3599
3785
 
3600
- #ifdef GGML_OPENCL_PROFILING
3601
- cl_event evt;
3602
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3603
-
3604
- g_profiling_info.emplace_back();
3605
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3606
- #else
3607
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3608
- #endif
3786
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3609
3787
  }
3610
3788
  }
3611
3789
 
@@ -3618,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3618
3796
  UNUSED(src1);
3619
3797
 
3620
3798
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3621
- cl_command_queue queue = backend_ctx->queue;
3622
3799
 
3623
3800
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3624
3801
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3645,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3645
3822
  size_t global_work_size[] = {(size_t)n, 1, 1};
3646
3823
  size_t local_work_size[] = {64, 1, 1};
3647
3824
 
3648
- #ifdef GGML_OPENCL_PROFILING
3649
- cl_event evt;
3650
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3651
-
3652
- g_profiling_info.emplace_back();
3653
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3654
- #else
3655
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3656
- #endif
3825
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3657
3826
  }
3658
3827
 
3659
3828
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3665,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3665
3834
  UNUSED(src1);
3666
3835
 
3667
3836
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3668
- cl_command_queue queue = backend_ctx->queue;
3669
3837
 
3670
3838
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3671
3839
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3692,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
3692
3860
  size_t global_work_size[] = {(size_t)n, 1, 1};
3693
3861
  size_t local_work_size[] = {64, 1, 1};
3694
3862
 
3695
- #ifdef GGML_OPENCL_PROFILING
3696
- cl_event evt;
3697
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3698
-
3699
- g_profiling_info.emplace_back();
3700
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3701
- #else
3702
- clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3703
- #endif
3863
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3704
3864
  }
3705
3865
 
3706
3866
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3712,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3712
3872
  UNUSED(src1);
3713
3873
 
3714
3874
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3715
- cl_command_queue queue = backend_ctx->queue;
3716
3875
 
3717
3876
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3718
3877
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3744,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
3744
3903
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3745
3904
  }
3746
3905
 
3747
- #ifdef GGML_OPENCL_PROFILING
3748
- cl_event evt;
3749
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3750
-
3751
- g_profiling_info.emplace_back();
3752
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3753
- #else
3754
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3755
- #endif
3906
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3756
3907
  }
3757
3908
 
3758
3909
  static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3764,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3764
3915
  UNUSED(src1);
3765
3916
 
3766
3917
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3767
- cl_command_queue queue = backend_ctx->queue;
3768
3918
 
3769
3919
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3770
3920
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3789,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
3789
3939
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3790
3940
  }
3791
3941
 
3792
- #ifdef GGML_OPENCL_PROFILING
3793
- cl_event evt;
3794
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3795
-
3796
- g_profiling_info.emplace_back();
3797
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3798
- #else
3799
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3800
- #endif
3942
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3801
3943
  }
3802
3944
 
3803
3945
  static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3809,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
3809
3951
  UNUSED(src1);
3810
3952
 
3811
3953
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3812
- cl_command_queue queue = backend_ctx->queue;
3813
3954
 
3814
3955
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3815
3956
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3841,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
3841
3982
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3842
3983
  }
3843
3984
 
3844
- #ifdef GGML_OPENCL_PROFILING
3845
- cl_event evt;
3846
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3847
-
3848
- g_profiling_info.emplace_back();
3849
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3850
- #else
3851
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3852
- #endif
3985
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3853
3986
  }
3854
3987
 
3855
3988
  static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3861,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
3861
3994
  UNUSED(src1);
3862
3995
 
3863
3996
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3864
- cl_command_queue queue = backend_ctx->queue;
3865
3997
 
3866
3998
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3867
3999
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3893,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
3893
4025
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3894
4026
  }
3895
4027
 
3896
- #ifdef GGML_OPENCL_PROFILING
3897
- cl_event evt;
3898
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3899
-
3900
- g_profiling_info.emplace_back();
3901
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3902
- #else
3903
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3904
- #endif
4028
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
3905
4029
  }
3906
4030
 
3907
4031
  static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3913,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
3913
4037
  UNUSED(src1);
3914
4038
 
3915
4039
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3916
- cl_command_queue queue = backend_ctx->queue;
3917
4040
 
3918
4041
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3919
4042
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3954,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
3954
4077
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
3955
4078
  size_t local_work_size[] = {(size_t)nth, 1, 1};
3956
4079
 
3957
- #ifdef GGML_OPENCL_PROFILING
3958
- cl_event evt;
3959
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3960
-
3961
- g_profiling_info.emplace_back();
3962
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3963
- #else
3964
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3965
- #endif
4080
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3966
4081
  }
3967
4082
 
3968
4083
  static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3974,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
3974
4089
  UNUSED(src1);
3975
4090
 
3976
4091
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3977
- cl_command_queue queue = backend_ctx->queue;
3978
4092
 
3979
4093
  //ggml_backend_opencl_device_context * dev_ctx =
3980
4094
  // (ggml_backend_opencl_device_context *)backend->device->context;
@@ -4038,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
4038
4152
  // This is local memory - the size depends on subgroup size.
4039
4153
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
4040
4154
 
4041
- #ifdef GGML_OPENCL_PROFILING
4042
- cl_event evt;
4043
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4044
-
4045
- g_profiling_info.emplace_back();
4046
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4047
- #else
4048
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4049
- #endif
4155
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4050
4156
  }
4051
4157
 
4052
4158
  static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4058,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4058
4164
  UNUSED(src1);
4059
4165
 
4060
4166
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4061
- cl_command_queue queue = backend_ctx->queue;
4062
4167
 
4063
4168
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4064
4169
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4097,15 +4202,487 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4097
4202
  size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
4098
4203
  size_t local_work_size[] = {(size_t)sgs, 1, 1};
4099
4204
 
4100
- #ifdef GGML_OPENCL_PROFILING
4101
- cl_event evt;
4102
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4205
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4206
+ }
4103
4207
 
4104
- g_profiling_info.emplace_back();
4105
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4106
- #else
4107
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4108
- #endif
4208
+ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4209
+ GGML_ASSERT(src0);
4210
+ GGML_ASSERT(src0->extra);
4211
+ GGML_ASSERT(dst);
4212
+ GGML_ASSERT(dst->extra);
4213
+
4214
+ UNUSED(src1);
4215
+
4216
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4217
+
4218
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4219
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4220
+
4221
+ cl_ulong offset0_abs = extra0->offset + src0->view_offs;
4222
+ cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
4223
+
4224
+ cl_kernel kernel;
4225
+ if (dst->type == GGML_TYPE_F32) {
4226
+ kernel = backend_ctx->kernel_tanh_f32_nd;
4227
+ } else if (dst->type == GGML_TYPE_F16) {
4228
+ kernel = backend_ctx->kernel_tanh_f16_nd;
4229
+ } else {
4230
+ GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
4231
+ }
4232
+ GGML_ASSERT(kernel != nullptr);
4233
+
4234
+ const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
4235
+ const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
4236
+
4237
+ const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
4238
+ const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
4239
+
4240
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4241
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
4242
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4243
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
4244
+
4245
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
4246
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
4247
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
4248
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
4249
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
4250
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
4251
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
4252
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
4253
+
4254
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
4255
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
4256
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
4257
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
4258
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
4259
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
4260
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
4261
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
4262
+
4263
+ size_t global_work_size[3];
4264
+ if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
4265
+ return;
4266
+ }
4267
+ global_work_size[0] = (size_t)ne10;
4268
+ global_work_size[1] = (size_t)ne11;
4269
+ global_work_size[2] = (size_t)ne12;
4270
+
4271
+ size_t lws0 = 16, lws1 = 4, lws2 = 1;
4272
+ if (ne10 < 16) lws0 = ne10;
4273
+ if (ne11 < 4) lws1 = ne11;
4274
+ if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
4275
+
4276
+ while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
4277
+ while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
4278
+ while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
4279
+
4280
+
4281
+ size_t local_work_size[] = {lws0, lws1, lws2};
4282
+
4283
+ size_t* local_work_size_ptr = local_work_size;
4284
+ if (!backend_ctx->non_uniform_workgroups) {
4285
+ if (global_work_size[0] % local_work_size[0] != 0 ||
4286
+ global_work_size[1] % local_work_size[1] != 0 ||
4287
+ global_work_size[2] % local_work_size[2] != 0) {
4288
+ local_work_size_ptr = NULL;
4289
+ }
4290
+ }
4291
+ if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4292
+
4293
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4294
+ }
4295
+
4296
+ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
4297
+ GGML_ASSERT(src0);
4298
+ GGML_ASSERT(src0->extra);
4299
+ GGML_ASSERT(dst);
4300
+ GGML_ASSERT(dst->extra);
4301
+ GGML_ASSERT(dst->type == src0->type);
4302
+
4303
+ UNUSED(src1_shape_def);
4304
+
4305
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4306
+
4307
+ if (backend_ctx->kernel_repeat == nullptr) {
4308
+ GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
4309
+ return;
4310
+ }
4311
+
4312
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4313
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4314
+
4315
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4316
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4317
+
4318
+ const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
4319
+ const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
4320
+
4321
+ const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
4322
+ const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
4323
+
4324
+ cl_kernel kernel = backend_ctx->kernel_repeat;
4325
+
4326
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4327
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
4328
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
4329
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4330
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
4331
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
4332
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
4333
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
4334
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
4335
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
4336
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
4337
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
4338
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
4339
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
4340
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
4341
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
4342
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
4343
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
4344
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
4345
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
4346
+
4347
+ size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
4348
+ size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
4349
+ size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
4350
+
4351
+ size_t global_work_size[] = { gws0, gws1, gws2 };
4352
+
4353
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4354
+ }
4355
+
4356
+ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4357
+ GGML_ASSERT(src0);
4358
+ GGML_ASSERT(src0->extra);
4359
+ GGML_ASSERT(dst);
4360
+ GGML_ASSERT(dst->extra);
4361
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4362
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4363
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4364
+
4365
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4366
+
4367
+ if (backend_ctx->kernel_pad == nullptr) {
4368
+ GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
4369
+ return;
4370
+ }
4371
+
4372
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4373
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4374
+
4375
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4376
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4377
+
4378
+ const int s_ne0 = src0->ne[0];
4379
+ const int s_ne1 = src0->ne[1];
4380
+ const int s_ne2 = src0->ne[2];
4381
+
4382
+ const int d_ne0 = dst->ne[0];
4383
+ const int d_ne1 = dst->ne[1];
4384
+ const int d_ne2 = dst->ne[2];
4385
+
4386
+ cl_kernel kernel = backend_ctx->kernel_pad;
4387
+
4388
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4389
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4390
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4391
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4392
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
4393
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
4394
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
4395
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
4396
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
4397
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
4398
+
4399
+ size_t lws0 = 64;
4400
+ size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
4401
+
4402
+ size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
4403
+ size_t local_work_size[] = { lws0, 1, 1 };
4404
+
4405
+ size_t * local_work_size_ptr = local_work_size;
4406
+ if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
4407
+ local_work_size_ptr = nullptr;
4408
+ }
4409
+
4410
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4411
+ }
4412
+
4413
+ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4414
+ GGML_ASSERT(src0);
4415
+ GGML_ASSERT(src0->extra);
4416
+ GGML_ASSERT(dst);
4417
+ GGML_ASSERT(dst->extra);
4418
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4419
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4420
+
4421
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
+
4423
+ const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4424
+ cl_kernel kernel = nullptr;
4425
+
4426
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4427
+ kernel = backend_ctx->kernel_upscale;
4428
+ if (kernel == nullptr) {
4429
+ GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
4430
+ return;
4431
+ }
4432
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4433
+ kernel = backend_ctx->kernel_upscale_bilinear;
4434
+ if (kernel == nullptr) {
4435
+ GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
4436
+ return;
4437
+ }
4438
+ } else {
4439
+ GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
4440
+ return;
4441
+ }
4442
+
4443
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4444
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4445
+
4446
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4447
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4448
+
4449
+ const cl_ulong nb00 = src0->nb[0];
4450
+ const cl_ulong nb01 = src0->nb[1];
4451
+ const cl_ulong nb02 = src0->nb[2];
4452
+ const cl_ulong nb03 = src0->nb[3];
4453
+
4454
+ const int ne00_src = src0->ne[0];
4455
+ const int ne01_src = src0->ne[1];
4456
+
4457
+ const int ne10_dst = dst->ne[0];
4458
+ const int ne11_dst = dst->ne[1];
4459
+ const int ne12_dst = dst->ne[2];
4460
+ const int ne13_dst = dst->ne[3];
4461
+
4462
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
4463
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
4464
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
4465
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
4466
+
4467
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4468
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4469
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4470
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4471
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
4472
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
4473
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
4474
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
4475
+
4476
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4477
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
4478
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
4479
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
4480
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
4481
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
4482
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
4483
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
4484
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
4485
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4486
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
4487
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
4488
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
4489
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
4490
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
4491
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
4492
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
4493
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
4494
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
4495
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
4496
+ }
4497
+
4498
+
4499
+ size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
4500
+ if (dst_total_elements == 0) {
4501
+ return;
4502
+ }
4503
+ size_t global_work_size[] = { dst_total_elements, 1, 1 };
4504
+ size_t local_work_size_pref = 256;
4505
+ size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
4506
+
4507
+ size_t * local_work_size_ptr = local_work_size;
4508
+ if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
4509
+ local_work_size_ptr = nullptr;
4510
+ }
4511
+
4512
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4513
+ }
4514
+
4515
+ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4516
+ GGML_ASSERT(src0);
4517
+ GGML_ASSERT(src0->extra);
4518
+ GGML_ASSERT(src1);
4519
+ GGML_ASSERT(src1->extra);
4520
+ GGML_ASSERT(dst);
4521
+ GGML_ASSERT(dst->extra);
4522
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4523
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4524
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4525
+
4526
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4527
+ cl_command_queue queue = backend_ctx->queue;
4528
+
4529
+ if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
4530
+ GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
4531
+ return;
4532
+ }
4533
+
4534
+ ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
4535
+ ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
4536
+ ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
4537
+
4538
+ cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
4539
+ cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
4540
+ cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
4541
+
4542
+ const int32_t dim = ((const int32_t *) dst->op_params)[0];
4543
+ GGML_ASSERT(dim >= 0 && dim <= 3);
4544
+
4545
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
4546
+ if (dim == 3) {
4547
+
4548
+ size_t nbytes_src0 = ggml_nbytes(src0);
4549
+ size_t nbytes_src1 = ggml_nbytes(src1);
4550
+
4551
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
4552
+ off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
4553
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
4554
+ off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
4555
+ } else {
4556
+
4557
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
4558
+ size_t global_work_size[3];
4559
+
4560
+ for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
4561
+ cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
4562
+ cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
4563
+ cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
4564
+
4565
+ int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
4566
+ int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
4567
+ int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
4568
+
4569
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4570
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
4571
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4572
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
4573
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4574
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
4575
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
4576
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
4577
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
4578
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
4579
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
4580
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
4581
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
4582
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
4583
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
4584
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
4585
+
4586
+ global_work_size[0] = d_ne0;
4587
+ global_work_size[1] = d_ne1;
4588
+ global_work_size[2] = d_ne2;
4589
+
4590
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4591
+ }
4592
+ }
4593
+ } else {
4594
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
4595
+
4596
+ long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
4597
+ cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
4598
+
4599
+ cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
4600
+
4601
+ long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
4602
+ cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
4603
+
4604
+
4605
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4606
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4607
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4608
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
4609
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4610
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
4611
+
4612
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
4613
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
4614
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
4615
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
4616
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
4617
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
4618
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
4619
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
4620
+
4621
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
4622
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
4623
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
4624
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
4625
+
4626
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
4627
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
4628
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
4629
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
4630
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
4631
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
4632
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
4633
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
4634
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
4635
+
4636
+ size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
4637
+ d_ne2 > 0 ? (size_t)d_ne2 : 1,
4638
+ d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4639
+
4640
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
4641
+ }
4642
+ }
4643
+
4644
+ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4645
+ GGML_ASSERT(src0);
4646
+ GGML_ASSERT(src0->extra);
4647
+ GGML_ASSERT(dst);
4648
+ GGML_ASSERT(dst->extra);
4649
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4650
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4651
+
4652
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4653
+
4654
+ if (backend_ctx->kernel_timestep_embedding == nullptr) {
4655
+ GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
4656
+ return;
4657
+ }
4658
+
4659
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4660
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4661
+
4662
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4663
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4664
+
4665
+ const int logical_dim = dst->op_params[0];
4666
+ const int max_period = dst->op_params[1];
4667
+ const int dst_nb1_bytes = dst->nb[1];
4668
+
4669
+ cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
4670
+
4671
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4672
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4673
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4674
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4675
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
4676
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
4677
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
4678
+
4679
+ size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
4680
+
4681
+ size_t gws1 = (size_t)src0->ne[0];
4682
+
4683
+ size_t global_work_size[] = {gws0, gws1, 1};
4684
+
4685
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4109
4686
  }
4110
4687
 
4111
4688
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4120,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4120
4697
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4121
4698
 
4122
4699
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4123
- cl_command_queue queue = backend_ctx->queue;
4124
4700
 
4125
4701
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4126
4702
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4325,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4325
4901
  static_cast<size_t>(padded_height_B)
4326
4902
  };
4327
4903
 
4328
- #ifdef GGML_OPENCL_PROFILING
4329
- cl_event evt;
4330
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
4331
-
4332
- g_profiling_info.emplace_back();
4333
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
4334
- #else
4335
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
4336
- #endif
4904
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
4337
4905
  } else {
4338
4906
  // no need to transpose B in other cases
4339
4907
  // create an image for B from sub_buffer
@@ -4455,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4455
5023
 
4456
5024
  // enqueue kernel with profiling
4457
5025
  // <--------------------------------------------> //
4458
- #ifdef GGML_OPENCL_PROFILING
4459
- cl_event evt;
4460
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4461
-
4462
- g_profiling_info.emplace_back();
4463
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4464
- // enqueue kernel without profiling
4465
- #else
4466
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4467
- #endif
5026
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4468
5027
  // <--------------------------------------------> //
4469
5028
 
4470
5029
  // deallocate sub buffers and images
@@ -4544,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4544
5103
  global_work_size[2] = (size_t)ne12*ne13;
4545
5104
  }
4546
5105
 
4547
- #ifdef GGML_OPENCL_PROFILING
4548
- cl_event evt;
4549
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4550
-
4551
- g_profiling_info.emplace_back();
4552
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4553
- #else
4554
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4555
- #endif
5106
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4556
5107
  return;
4557
5108
  }
4558
5109
  #else // GGML_OPENCL_SOA_Q
@@ -4782,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4782
5333
  size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
4783
5334
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4784
5335
 
4785
- #ifdef GGML_OPENCL_PROFILING
4786
- cl_event evt;
4787
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4788
-
4789
- g_profiling_info.emplace_back();
4790
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4791
- #else
4792
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4793
- #endif
5336
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4794
5337
  } else if (src0t == GGML_TYPE_Q4_K) {
4795
5338
  GGML_ASSERT(false && "not implemented");
4796
5339
  } else if (src0t == GGML_TYPE_Q3_K) {
@@ -4801,31 +5344,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4801
5344
  size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
4802
5345
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4803
5346
 
4804
- #ifdef GGML_OPENCL_PROFILING
4805
- cl_event evt;
4806
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4807
-
4808
- g_profiling_info.emplace_back();
4809
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4810
- #else
4811
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4812
- #endif
5347
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4813
5348
  } else {
4814
5349
  int64_t ny = (ne11 + nrows - 1)/nrows;
4815
5350
 
4816
5351
  size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
4817
5352
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
4818
5353
 
4819
- #ifdef GGML_OPENCL_PROFILING
4820
- cl_event evt;
4821
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5354
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5355
+ }
5356
+ }
4822
5357
 
4823
- g_profiling_info.emplace_back();
4824
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4825
- #else
4826
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5358
+ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5359
+ GGML_ASSERT(src0);
5360
+ GGML_ASSERT(src0->extra);
5361
+ GGML_ASSERT(src1);
5362
+ GGML_ASSERT(src1->extra);
5363
+ GGML_ASSERT(dst);
5364
+ GGML_ASSERT(dst->extra);
5365
+
5366
+ const ggml_tensor * src2 = dst->src[2];
5367
+ GGML_ASSERT(src2);
5368
+ GGML_ASSERT(src2->extra);
5369
+
5370
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5371
+
5372
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5373
+ ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
5374
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5375
+
5376
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
5377
+ cl_ulong offset2 = extra2->offset + src2->view_offs;
5378
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
5379
+
5380
+ #ifdef GGML_OPENCL_SOA_Q
5381
+ ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
4827
5382
  #endif
5383
+
5384
+ const int ne00 = src0->ne[0];
5385
+ const int ne01 = src0->ne[1];
5386
+ const int ne02 = src0->ne[2];
5387
+ const int ne03 = src0->ne[3];
5388
+
5389
+ const cl_ulong nb00 = src0->nb[0];
5390
+ const cl_ulong nb02 = src0->nb[2];
5391
+
5392
+ const int ne10 = src1->ne[0];
5393
+ const int ne11 = src1->ne[1];
5394
+ const int ne12 = src1->ne[2];
5395
+ const int ne13 = src1->ne[3];
5396
+
5397
+ const cl_ulong nb11 = src1->nb[1];
5398
+ const cl_ulong nb12 = src1->nb[2];
5399
+
5400
+ const int ne20 = src2->ne[0];
5401
+ const int ne21 = src2->ne[1];
5402
+
5403
+ const cl_ulong nb21 = src2->nb[1];
5404
+
5405
+ const int ne0 = dst->ne[0];
5406
+ const int ne1 = dst->ne[1];
5407
+
5408
+ const int r2 = ne12/ne02;
5409
+ const int r3 = ne13/ne03;
5410
+ const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
5411
+
5412
+ GGML_ASSERT(ne00 == ne10);
5413
+
5414
+ int sgs = 32; // subgroup size
5415
+ int nsg = 1; // number of subgroups
5416
+ int nrows = 1; // number of row in src1
5417
+ int ndst = 4; // number of values produced by each subgroup
5418
+
5419
+ cl_kernel kernel;
5420
+
5421
+ // subgroup mat vec
5422
+ switch (src0->type) {
5423
+ case GGML_TYPE_Q4_0: {
5424
+ kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
5425
+
5426
+ if (backend_ctx->gpu_family == INTEL) {
5427
+ sgs = 16;
5428
+ nsg = 1;
5429
+ ndst = 8;
5430
+ } else if (backend_ctx->gpu_family == ADRENO) {
5431
+ sgs = 64;
5432
+ nsg = 1;
5433
+ ndst = 8;
5434
+ } else {
5435
+ GGML_ASSERT(false && "TODO: Unknown GPU");
5436
+ }
5437
+
5438
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
5439
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
5440
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5441
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5442
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
5443
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
5444
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
5445
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
5446
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
5447
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
5448
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
5449
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
5450
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5451
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
5452
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
5453
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
5454
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
5455
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
5456
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
5457
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
5458
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
5459
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
5460
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
5461
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
5462
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
5463
+
5464
+ break;
5465
+ }
5466
+ default:
5467
+ GGML_ASSERT(false && "not implemented");;
4828
5468
  }
5469
+
5470
+ int _ne1 = 1;
5471
+ int ne123 = dst_rows;
5472
+
5473
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5474
+ size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5475
+
5476
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4829
5477
  }
4830
5478
 
4831
5479
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4838,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
4838
5486
  GGML_ASSERT(ggml_is_contiguous(src0));
4839
5487
 
4840
5488
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4841
- cl_command_queue queue = backend_ctx->queue;
4842
5489
 
4843
5490
  float scale;
4844
5491
  memcpy(&scale, dst->op_params, sizeof(scale));
@@ -4867,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
4867
5514
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
4868
5515
  }
4869
5516
 
4870
- #ifdef GGML_OPENCL_PROFILING
4871
- cl_event evt;
4872
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4873
-
4874
- g_profiling_info.emplace_back();
4875
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
4876
- #else
4877
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4878
- #endif
5517
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
4879
5518
  }
4880
5519
 
4881
5520
  static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4912,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
4912
5551
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
4913
5552
 
4914
5553
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4915
- cl_command_queue queue = backend_ctx->queue;
4916
5554
 
4917
5555
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4918
5556
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4977,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
4977
5615
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4978
5616
  size_t local_work_size[] = {(size_t)nth, 1, 1};
4979
5617
 
4980
- #ifdef GGML_OPENCL_PROFILING
4981
- cl_event evt;
4982
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4983
-
4984
- g_profiling_info.emplace_back();
4985
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
4986
- #else
4987
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4988
- #endif
5618
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
4989
5619
  }
4990
5620
 
4991
5621
  static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5008,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5008
5638
  const int ne02 = src0 ? src0->ne[2] : 0;
5009
5639
 
5010
5640
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5011
- cl_command_queue queue = backend_ctx->queue;
5012
5641
 
5013
5642
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5014
5643
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5032,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5032
5661
  size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
5033
5662
  size_t local_work_size[] = {64, 1, 1};
5034
5663
 
5035
- #ifdef GGML_OPENCL_PROFILING
5036
- cl_event evt;
5037
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5038
-
5039
- g_profiling_info.emplace_back();
5040
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5041
- #else
5042
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5043
- #endif
5664
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5044
5665
  } else {
5045
5666
  kernel = backend_ctx->kernel_diag_mask_inf;
5046
5667
 
@@ -5060,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
5060
5681
  local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5061
5682
  }
5062
5683
 
5063
- #ifdef GGML_OPENCL_PROFILING
5064
- cl_event evt;
5065
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
5066
-
5067
- g_profiling_info.emplace_back();
5068
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
5069
- #else
5070
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
5071
- #endif
5684
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5072
5685
  }
5073
5686
  }
5074
5687
 
@@ -5088,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5088
5701
  }
5089
5702
 
5090
5703
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5091
- cl_command_queue queue = backend_ctx->queue;
5092
5704
 
5093
5705
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5094
5706
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5168,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5168
5780
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5169
5781
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5170
5782
 
5171
- #ifdef GGML_OPENCL_PROFILING
5172
- cl_event evt;
5173
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5174
-
5175
- g_profiling_info.emplace_back();
5176
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5177
- #else
5178
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5179
- #endif
5783
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5180
5784
  }
5181
5785
 
5182
5786
  static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5188,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
5188
5792
  GGML_ASSERT(dst->extra);
5189
5793
 
5190
5794
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5191
- cl_command_queue queue = backend_ctx->queue;
5192
5795
 
5193
5796
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5194
5797
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -5354,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
5354
5957
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5355
5958
  size_t local_work_size[] = {(size_t)nth, 1, 1};
5356
5959
 
5357
- #ifdef GGML_OPENCL_PROFILING
5358
- cl_event evt;
5359
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5360
-
5361
- g_profiling_info.emplace_back();
5362
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5363
- #else
5364
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5365
- #endif
5960
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5366
5961
  }
5367
5962
 
5368
5963
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5377,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
5377
5972
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
5378
5973
 
5379
5974
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5380
- cl_command_queue queue = backend_ctx->queue;
5381
5975
 
5382
5976
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5383
5977
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5446,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
5446
6040
  size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
5447
6041
  size_t local_work_size[] = {256, 1, 1};
5448
6042
 
5449
- #ifdef GGML_OPENCL_PROFILING
5450
- cl_event evt;
5451
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5452
-
5453
- g_profiling_info.emplace_back();
5454
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5455
- #else
5456
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5457
- #endif
6043
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5458
6044
  }
5459
6045
 
5460
6046
  static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5469,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
5469
6055
  GGML_ASSERT(ggml_is_contiguous(src0));
5470
6056
 
5471
6057
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5472
- cl_command_queue queue = backend_ctx->queue;
5473
6058
 
5474
6059
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5475
6060
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5501,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
5501
6086
  size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
5502
6087
  size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
5503
6088
 
5504
- #ifdef GGML_OPENCL_PROFILING
5505
- cl_event evt;
5506
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5507
-
5508
- g_profiling_info.emplace_back();
5509
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5510
- #else
5511
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5512
- #endif
6089
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5513
6090
  }
5514
6091
 
5515
6092
  static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -5523,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
5523
6100
  GGML_ASSERT(ggml_is_contiguous(src0));
5524
6101
 
5525
6102
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5526
- cl_command_queue queue = backend_ctx->queue;
5527
6103
 
5528
6104
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5529
6105
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5564,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
5564
6140
  size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
5565
6141
  size_t local_work_size[] = {(size_t)64, 1, 1};
5566
6142
 
5567
- #ifdef GGML_OPENCL_PROFILING
5568
- cl_event evt;
5569
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5570
-
5571
- g_profiling_info.emplace_back();
5572
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5573
- #else
5574
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5575
- #endif
6143
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5576
6144
  }
5577
6145
 
5578
6146
  //------------------------------------------------------------------------------
@@ -5667,6 +6235,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5667
6235
  }
5668
6236
  func = ggml_cl_sigmoid;
5669
6237
  break;
6238
+ case GGML_UNARY_OP_TANH:
6239
+ if (!any_on_device) {
6240
+ return false;
6241
+ }
6242
+ func = ggml_cl_tanh;
6243
+ break;
5670
6244
  default:
5671
6245
  return false;
5672
6246
  } break;
@@ -5694,12 +6268,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5694
6268
  }
5695
6269
  func = ggml_cl_group_norm;
5696
6270
  break;
6271
+ case GGML_OP_REPEAT:
6272
+ if (!any_on_device) {
6273
+ return false;
6274
+ }
6275
+ func = ggml_cl_repeat;
6276
+ break;
6277
+ case GGML_OP_PAD:
6278
+ if (!any_on_device) {
6279
+ return false;
6280
+ }
6281
+ ggml_cl_pad(backend, tensor->src[0], tensor);
6282
+ return true;
6283
+ case GGML_OP_UPSCALE:
6284
+ if (!any_on_device) {
6285
+ return false;
6286
+ }
6287
+ ggml_cl_upscale(backend, tensor->src[0], tensor);
6288
+ return true;
6289
+ case GGML_OP_CONCAT:
6290
+ if (!any_on_device) {
6291
+ return false;
6292
+ }
6293
+ func = ggml_cl_concat;
6294
+ break;
6295
+ case GGML_OP_TIMESTEP_EMBEDDING:
6296
+ if (!any_on_device) {
6297
+ return false;
6298
+ }
6299
+ ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
6300
+ return true;
5697
6301
  case GGML_OP_MUL_MAT:
5698
6302
  if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
5699
6303
  return false;
5700
6304
  }
5701
6305
  func = ggml_cl_mul_mat;
5702
6306
  break;
6307
+ case GGML_OP_MUL_MAT_ID:
6308
+ if (!any_on_device) {
6309
+ return false;
6310
+ }
6311
+ func = ggml_cl_mul_mat_id;
6312
+ break;
5703
6313
  case GGML_OP_SCALE:
5704
6314
  if (!any_on_device) {
5705
6315
  return false;