@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <memory>
 #include <charconv>
+#include <mutex>
 
 #undef MIN
 #undef MAX
@@ -74,6 +75,7 @@ struct ggml_cl_version {
     cl_uint minor = 0;
 };
 
+
 struct ggml_cl_compiler_version {
     ADRENO_CL_COMPILER_TYPE type;
     int major = -1;
@@ -91,6 +93,14 @@ struct ggml_cl_compiler_version {
     }
 };
 
+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
 // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
     size_t major_str_begin = 0;
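Note on the align_to() helper added above: it rounds a value up to the next multiple of a power-of-two alignment, and is used later when carving aligned sub-buffers out of a parent cl_mem. A minimal standalone sketch of the arithmetic (illustrative values, not part of the package):

```cpp
#include <cassert>
#include <cstddef>

// Same round-up arithmetic as align_to() in the diff, minus the asserts.
static size_t align_to(size_t value, size_t to_alignment) {
    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}

int main() {
    assert(align_to(100, 64) == 128);  // rounded up to the next multiple of 64
    assert(align_to(128, 64) == 128);  // already-aligned values are unchanged
    assert(align_to(1, 4096) == 4096); // small offsets jump to the first boundary
}
```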
@@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }
 
+struct ggml_backend_opencl_context;
+
 // backend device context
 struct ggml_backend_opencl_device_context {
     cl_platform_id platform;
     std::string platform_name;
 
-    cl_device_id device;
-    std::string device_name;
+    cl_device_id   device;
+    std::string    device_name;
+    cl_device_type device_type;
+    std::string    device_version;
+
+    // Initialized by ggml_cl2_init().
+    ggml_backend_opencl_context * backend_ctx = nullptr;
+
+    // Initialized by ggml_backend_opencl_device_get_buffer_type()
+    ggml_backend_buffer_type buffer_type;
+
+    cl_context context = nullptr;
 };
 
 // backend context
@@ -248,6 +270,8 @@ struct ggml_backend_opencl_context {
 
     int adreno_wave_size;
 
+    cl_bool non_uniform_workgroups;
+
     cl_context context;
     cl_command_queue queue;
 
@@ -275,27 +299,37 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f16_f32;
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
+    cl_program program_div;
+    cl_program program_sub;
     cl_program program_norm;
     cl_program program_relu;
     cl_program program_rms_norm;
+    cl_program program_group_norm;
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
+    cl_program program_sigmoid;
     cl_program program_softmax_f32;
     cl_program program_softmax_f16;
     cl_program program_softmax_4_f32;
     cl_program program_softmax_4_f16;
+    cl_program program_argsort_f32_i32;
+    cl_program program_sum_rows_f32;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_div, kernel_div_row;
+    cl_kernel kernel_sub, kernel_sub_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
+    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -315,6 +349,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
+    cl_kernel kernel_argsort_f32_i32;
+    cl_kernel kernel_sum_rows_f32;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -344,15 +380,8 @@ struct ggml_backend_opencl_context {
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 };
 
-static ggml_backend_device g_ggml_backend_opencl_device;
-static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
-    /*.platform =*/ nullptr,
-    /*.platform_nane =*/ "",
-    /*.device =*/ nullptr,
-    /*.device_name =*/ "",
-};
-
-static int ggml_backend_opencl_n_devices = 0;
+// All registered devices with a default device in the front.
+static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
 
 // Profiling
 #ifdef GGML_OPENCL_PROFILING
@@ -969,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // argsort
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "argsort.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("argsort.cl");
+#endif
+        backend_ctx->program_argsort_f32_i32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // div
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "div.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("div.cl");
+#endif
+        backend_ctx->program_div =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sub
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sub.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sub.cl");
+#endif
+        backend_ctx->program_sub =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sub     = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sum_rows
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sum_rows.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sum_rows.cl");
+#endif
+        backend_ctx->program_sum_rows_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // sigmoid
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sigmoid.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sigmoid.cl");
+#endif
+        backend_ctx->program_sigmoid =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // group_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "group_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("group_norm.cl");
+#endif
+        backend_ctx->program_group_norm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
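Each of the new op loaders above (argsort, div, sub, sum_rows, sigmoid, group_norm) repeats the same embed-or-read, build, then clCreateKernel sequence. A condensed sketch of that OpenCL flow with plain asserts in place of the CL_CHECK comma-expression macro (illustrative; load_kernel is a made-up helper name, not something in the package):

```cpp
#include <CL/cl.h>
#include <cassert>

// Illustrative: build one program from source text and extract a named kernel,
// mirroring build_program_from_source() + clCreateKernel() in the diff.
static cl_kernel load_kernel(cl_context ctx, cl_device_id dev,
                             const char * src, const char * name) {
    cl_int err = CL_SUCCESS;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    assert(err == CL_SUCCESS);
    err = clBuildProgram(prog, 1, &dev, "" /* compile_opts would go here */, NULL, NULL);
    assert(err == CL_SUCCESS);
    cl_kernel kernel = clCreateKernel(prog, name, &err);
    assert(err == CL_SUCCESS);
    return kernel;
}
```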
@@ -1107,25 +1235,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
     GGML_LOG_CONT("\n");
 }
 
-static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
-    static bool initialized = false;
-    static ggml_backend_opencl_context *backend_ctx = nullptr;
-
-    if (initialized) {
-        return backend_ctx;
-    }
+// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+// XXX     static bool initialized = false;
+// XXX     static ggml_backend_opencl_context *backend_ctx = nullptr;
 
-    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
-    GGML_ASSERT(dev_ctx);
-    GGML_ASSERT(dev_ctx->platform == nullptr);
-    GGML_ASSERT(dev_ctx->device == nullptr);
-    GGML_ASSERT(backend_ctx == nullptr);
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
 
-    initialized = true;
-    backend_ctx = new ggml_backend_opencl_context();
-    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+namespace /* anonymous */ {
+extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
+}
 
-    cl_int err;
+// Look for available and suitable devices.
+static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
+    std::vector<ggml_backend_device> found_devices;
 
 #ifdef GGML_OPENCL_PROFILING
     GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
@@ -1158,11 +1280,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     struct cl_device devices[NDEV];
     unsigned n_devices = 0;
     struct cl_device * default_device = NULL;
+    unsigned default_platform_number = 0;
 
     cl_platform_id platform_ids[NPLAT];
     if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
         GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
-        return backend_ctx;
+        return found_devices;
     }
 
     for (unsigned i = 0; i < n_platforms; i++) {
@@ -1197,19 +1320,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         }
 
         if (default_device == NULL && p->default_device != NULL) {
-            default_device = p->default_device;
+            default_device          = p->default_device;
+            default_platform_number = i;
         }
     }
 
     if (n_devices == 0) {
         GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
-        return backend_ctx;
+        return found_devices;
     }
 
-    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
-    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
-    int user_platform_number = -1;
-    int user_device_number = -1;
+    char * user_platform_string   = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string     = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number      = -1;
+    int user_device_number        = -1;
+    cl_device * candidate_devices = nullptr;
+    unsigned n_candidate_devices  = 0;
 
     unsigned n;
     if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
@@ -1224,12 +1350,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
             GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
             exit(1);
         }
-        default_device = &platform->devices[user_device_number];
+        default_device      = &platform->devices[user_device_number];
+        candidate_devices   = platform->devices;
+        n_candidate_devices = platform->n_devices;
     } else {
-
-        struct cl_device * selected_devices = devices;
-        unsigned n_selected_devices = n_devices;
-
+        // Choose a platform by matching a substring.
         if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
             for (unsigned i = 0; i < n_platforms; i++) {
                 struct cl_platform * p = &platforms[i];
@@ -1244,20 +1369,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
                 exit(1);
             }
         }
-        if (user_platform_number != -1) {
-            struct cl_platform * p = &platforms[user_platform_number];
-            selected_devices = p->devices;
-            n_selected_devices = p->n_devices;
-            default_device = p->default_device;
-            if (n_selected_devices == 0) {
-                GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
-                exit(1);
-            }
+
+        int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
+        struct cl_platform * p = &platforms[platform_idx];
+        candidate_devices = p->devices;
+        n_candidate_devices = p->n_devices;
+        default_device = p->default_device;
+        if (n_candidate_devices == 0) {
+            GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+            exit(1);
         }
 
         if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
-            for (unsigned i = 0; i < n_selected_devices; i++) {
-                struct cl_device * d = &selected_devices[i];
+            for (unsigned i = 0; i < n_candidate_devices; i++) {
+                struct cl_device * d = &candidate_devices[i];
                 if (strstr(d->name, user_device_string) != NULL) {
                     user_device_number = d->number;
                     break;
@@ -1269,71 +1394,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
             }
         }
         if (user_device_number != -1) {
-            selected_devices = &devices[user_device_number];
-            n_selected_devices = 1;
-            default_device = &selected_devices[0];
+            candidate_devices = &devices[user_device_number];
+            n_candidate_devices = 1;
+            default_device = &candidate_devices[0];
         }
 
-        GGML_ASSERT(n_selected_devices > 0);
+        GGML_ASSERT(n_candidate_devices > 0);
 
         if (default_device == NULL) {
-            default_device = &selected_devices[0];
+            default_device = &candidate_devices[0];
+        }
+    }
+
+    GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
+
+    // Put the default device in front.
+    for (unsigned i = 1; i < n_candidate_devices; i++) {
+        if (&candidate_devices[i] == default_device) {
+            std::swap(candidate_devices[0], candidate_devices[i]);
+            default_device = &candidate_devices[0];
+            break;
+        }
+    }
+
+    GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
+
+    std::vector<cl_device_id> device_ids;
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        device_ids.push_back(dev->id);
+    }
+
+    cl_int err;
+    cl_context shared_context;
+    cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
+
+    CL_CHECK(
+        (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
+
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
+
+        auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
+            /*.platform       =*/ dev->platform->id,
+            /*.platform_nane  =*/ dev->platform->name,
+            /*.device         =*/ dev->id,
+            /*.device_name    =*/ dev->name,
+            /*.device_type    =*/ dev->type,
+            /*.device_version =*/ dev->version,
+            /*.backend_ctx    =*/ nullptr,
+            /*.buffer_type    =*/ {},
+            /*.context        =*/ shared_context,
+        });
+
+        found_devices.push_back(ggml_backend_device{
+            /* .iface   = */ ggml_backend_opencl_device_i,
+            /* .reg     = */ reg,
+            /* .context = */ dev_ctx.get(),
+        });
+
+        if (!ggml_cl2_init(&found_devices.back())) {
+            found_devices.pop_back();
+            GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
+            continue;
+        }
+
+        dev_ctx.release();
+    }
+
+    if (found_devices.size()) {
+        auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
+        GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
+            dev_ctx->device_version.c_str());
+
+        if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
+            GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
+                dev_ctx->device_name.c_str());
         }
     }
 
-    GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
-    GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
-    if (default_device->type != CL_DEVICE_TYPE_GPU) {
-        GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    return found_devices;
+}
+
+// Initialize device if it is supported (returns nullptr if it is not).
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+    GGML_ASSERT(dev);
+    GGML_ASSERT(dev->context);
+
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    GGML_ASSERT(dev_ctx->platform);
+    GGML_ASSERT(dev_ctx->device);
+
+    if (dev_ctx->backend_ctx) {
+        return dev_ctx->backend_ctx;
     }
 
-    dev_ctx->platform = default_device->platform->id;
-    dev_ctx->device = default_device->id;
-    backend_ctx->device = default_device->id;
+    auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
+    backend_ctx->device = dev_ctx->device;
+    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 
-    if (strstr(default_device->name, "Adreno") ||
-        strstr(default_device->name, "Qualcomm") ||
-        strstr(default_device->version, "Adreno")) {
+    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         // Usually device version contains the detailed device name
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
         if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
-            backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
         }
 
         // Use wave size of 64 for all Adreno GPUs.
        backend_ctx->adreno_wave_size = 64;
-    } else if (strstr(default_device->name, "Intel")) {
+    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
-        GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
         backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-        return backend_ctx;
+        return nullptr;
     }
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
         GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
             "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
-        return backend_ctx;
+        return nullptr;
     }
 #endif
 
     // Populate backend device name
-    dev_ctx->platform_name = default_device->platform->name;
-    dev_ctx->device_name = default_device->name;
-    backend_ctx->device_name = default_device->name;
+    backend_ctx->device_name = dev_ctx->device_name;
 
     // A local ref of cl_device_id for convenience
     cl_device_id device = backend_ctx->device;
 
-    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+    ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
 
     // Check device OpenCL version, OpenCL 2.0 or above is required
     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
     if (opencl_c_version.major < 2) {
         GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     // Check driver version
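The probe above creates a single cl_context shared by every candidate device on the selected platform, then hands that context to each device's ggml_backend_opencl_device_context. A minimal sketch of the shared-context call (illustrative; error handling reduced to a NULL return):

```cpp
#include <CL/cl.h>
#include <vector>

// Illustrative: one context spanning several devices of the same platform,
// as in ggml_opencl_probe_devices(). Buffers created against this context
// can be used by command queues on any of the listed devices.
static cl_context make_shared_context(cl_platform_id platform,
                                      const std::vector<cl_device_id> & devices) {
    cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0
    };
    cl_int err = CL_SUCCESS;
    cl_context ctx = clCreateContext(props, (cl_uint) devices.size(),
                                     devices.data(), NULL, NULL, &err);
    return err == CL_SUCCESS ? ctx : NULL;
}
```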
@@ -1364,7 +1563,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // fp16 is required
     if (!backend_ctx->fp16_support) {
         GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
@@ -1373,7 +1572,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
         GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
             "(note that subgroups is an optional feature in OpenCL 3.0)\n");
-        return backend_ctx;
+        return nullptr;
     }
 
     cl_uint base_align_in_bits;
@@ -1397,6 +1596,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
 
+    if (opencl_c_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+            &backend_ctx->non_uniform_workgroups, 0));
+    } else {
+        GGML_ASSERT(opencl_c_version.major == 2);
+        // Non-uniform workgroup sizes is mandatory feature in v2.x.
+        backend_ctx->non_uniform_workgroups = true;
+    }
+
     // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
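CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT is only a queryable property on OpenCL 3.0 devices; in OpenCL 2.x the feature is mandated by the spec, which is why the else-branch above simply asserts the version and sets the flag to true. A standalone sketch of the query (illustrative):

```cpp
#include <CL/cl.h>

// Illustrative: ask a 3.0 device whether it can execute kernels whose global
// size is not a multiple of the local size. On 2.x devices this query should
// be skipped and the answer assumed true.
static cl_bool supports_non_uniform_workgroups(cl_device_id dev) {
    cl_bool supported = CL_FALSE;
    clGetDeviceInfo(dev, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
                    sizeof(supported), &supported, NULL);
    return supported;
}
```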
@@ -1406,14 +1614,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
-    cl_context_properties properties[] = {
-        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
-    };
-
-    CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+    cl_int err;
 
     // A local ref of cl_context for convenience
-    cl_context context = backend_ctx->context;
+    cl_context context = backend_ctx->context = dev_ctx->context;
 
     //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
     //    (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
@@ -1426,7 +1630,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
 
     // Load kernels
-    load_cl_kernels(backend_ctx, opencl_c_version);
+    load_cl_kernels(backend_ctx.get(), opencl_c_version);
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Allocate intermediate buffers and images
@@ -1456,10 +1660,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
-    // For now we support a single devices
-    ggml_backend_opencl_n_devices = 1;
-
-    return backend_ctx;
+    dev_ctx->backend_ctx = backend_ctx.release();
+    return dev_ctx->backend_ctx;
 }
 
 static void ggml_cl2_free(void) {
@@ -1664,10 +1866,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+// Syncronizes the 'backend_ctx's device with others so that commands
+// enqueued to it won't start until commands in the other devices have
+// completed.
+static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
+    if (g_ggml_backend_opencl_devices.size() < 2)
+        return; // No other devices to synchronize with.
+
+    std::vector<cl_event> events;
+    events.reserve(g_ggml_backend_opencl_devices.size());
+
+    for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
+        auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
+        if (backend_ctx != other_backend_ctx) {
+            cl_event ev;
+            CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
+            CL_CHECK(clFlush(other_backend_ctx->queue));
+            events.push_back(ev);
+        }
+    }
+
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
+    for (auto ev : events) {
+        CL_CHECK(clReleaseEvent(ev));
+    }
+}
+
+static void sync_with_other_backends(ggml_backend_t backend) {
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    sync_with_other_backends(backend_ctx);
+}
+
 static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        // NOTE: this may oversynchronize by synchronizing with
+        // backends/devices which don't compute 'cgraph's
+        // dependencies.
+        sync_with_other_backends(backend);
+
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
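The synchronization above is the standard OpenCL marker/barrier event pattern: every other queue enqueues a marker event, and the target queue enqueues a barrier that waits on all of them. A minimal two-queue sketch of the same idea (illustrative, error handling elided):

```cpp
#include <CL/cl.h>

// Illustrative: stall queue_b until everything already enqueued on queue_a
// has completed, without blocking the host.
static void queue_b_waits_for_queue_a(cl_command_queue queue_a, cl_command_queue queue_b) {
    cl_event marker;
    clEnqueueMarkerWithWaitList(queue_a, 0, NULL, &marker);  // fires when queue_a drains
    clFlush(queue_a);                                        // ensure queue_a's work is submitted
    clEnqueueBarrierWithWaitList(queue_b, 1, &marker, NULL); // queue_b waits on the marker
    clReleaseEvent(marker);
}
```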
@@ -1729,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SUB:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -1736,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU_QUICK:
-                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                case GGML_UNARY_OP_SIGMOID:
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
             }
@@ -1746,11 +1988,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
-                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+                return op->src[1]->type == GGML_TYPE_F32;
             } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
@@ -1785,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             }
         case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_ARGSORT:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SUM_ROWS:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -2058,15 +2306,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // The original tensor memory is divided into scales and quants, i.e.,
         // we first store scales, then quants.
         // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
         region.size = size_d;
         extra->d = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);
+        auto previous_origin = region.origin;
 
         // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
         region.size = size_q;
         extra->q = clCreateSubBuffer(
             extra_orig->data_device, CL_MEM_READ_WRITE,
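The align_to() calls above exist because clCreateSubBuffer requires the region origin to respect the device's base address alignment (CL_DEVICE_MEM_BASE_ADDR_ALIGN, reported in bits); an unaligned origin fails with CL_MISALIGNED_SUB_BUFFER_OFFSET. A sketch of deriving a legal origin (illustrative):

```cpp
#include <CL/cl.h>

// Illustrative: round a sub-buffer origin up to the device's base alignment,
// which is what the align_to(..., backend_ctx->alignment) calls do above.
static size_t legal_sub_buffer_origin(cl_device_id dev, size_t origin_bytes) {
    cl_uint align_bits = 0;
    clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                    sizeof(align_bits), &align_bits, NULL);
    const size_t align_bytes = align_bits / 8;
    return ((origin_bytes + align_bytes - 1) / align_bytes) * align_bytes;
}
```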
@@ -2271,8 +2520,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     cl_context context = backend_ctx->context;
     cl_command_queue queue = backend_ctx->queue;
 
-    // Make sure all previously submitted commands are finished.
-    CL_CHECK(clFinish(queue));
+    // Make sure all previously submitted commands in other devices are finished.
+    sync_with_other_backends(backend_ctx);
 
 #ifdef GGML_OPENCL_SOA_Q
     // In end-to-end runs, get_tensor is usually used to get back the logits,
@@ -2376,13 +2625,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    // FIXME: not thread safe, device may not be initialized yet
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
-        alignment = backend_ctx->alignment;
-    }
-    return alignment;
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+    return backend_ctx->alignment;
 }
 
 static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
@@ -2409,16 +2653,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
-        /* .device  = */ &g_ggml_backend_opencl_device,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
 //
 // backend device
 //
@@ -2476,9 +2710,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
 }
 
 static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_opencl_buffer_type();
+    auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
 
-    GGML_UNUSED(dev);
+    dev_ctx->buffer_type = ggml_backend_buffer_type{
+        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
+        /* .device  = */ dev,
+        /* .context = */ nullptr,
+    };
+
+    return &dev_ctx->buffer_type;
 }
 
 static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -2494,12 +2734,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
 }
 
 static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+    // Check 'dev' and 'buffer_type' are not objects belonging to this backend.
+    if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
+        buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
+        return false;
+    }
 
-    GGML_UNUSED(dev);
+    // Check cl_context is the same. clEnqueue* commands may not use
+    // buffers from another cl_context.
+    ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
+    ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
+    return backend_ctx0->context == backend_ctx1->context;
 }
 
-static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+namespace /* anonymous */ {
+struct ggml_backend_device_i ggml_backend_opencl_device_i = {
     /* .get_name             = */ ggml_backend_opencl_device_get_name,
     /* .get_description      = */ ggml_backend_opencl_device_get_description,
     /* .get_memory           = */ ggml_backend_opencl_device_get_memory,
@@ -2516,6 +2765,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
     /* .event_free           = */ NULL,
     /* .event_synchronize    = */ NULL,
 };
+}
 
 // Backend registry
 
@@ -2526,15 +2776,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
 }
 
 static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
-    return ggml_backend_opencl_n_devices;
+    return g_ggml_backend_opencl_devices.size();
 
     GGML_UNUSED(reg);
 }
 
 static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
+    GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
 
-    return &g_ggml_backend_opencl_device;
+    return &g_ggml_backend_opencl_devices[index];
 
     GGML_UNUSED(reg);
     GGML_UNUSED(index);
@@ -2548,27 +2798,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_opencl_reg(void) {
-    // TODO: make this thread-safe somehow?
+    static std::mutex mutex;
     static ggml_backend_reg reg;
     static bool initialized = false;
+    std::lock_guard<std::mutex> lock(mutex);
 
-    if (!initialized) {
-        reg = ggml_backend_reg {
-            /* .api_version = */ GGML_BACKEND_API_VERSION,
-            /* .iface       = */ ggml_backend_opencl_reg_i,
-            /* .context     = */ NULL,
-        };
-
-        g_ggml_backend_opencl_device = ggml_backend_device {
-            /* .iface   = */ ggml_backend_opencl_device_i,
-            /* .reg     = */ &reg,
-            /* .context = */ &g_ggml_ctx_dev_main,
-        };
+    if (initialized) {
+        return &reg;
+    }
+    initialized = true;
 
-        ggml_cl2_init(&g_ggml_backend_opencl_device);
+    g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
 
-        initialized = true;
-    }
+    reg = ggml_backend_reg{
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_opencl_reg_i,
+        /* .context     = */ NULL,
+    };
 
     return &reg;
 }
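
The removed TODO is addressed with a function-local mutex and an initialized flag taken on every call; device probing now happens exactly once, under the lock, before the registry struct is filled in (safe, since reg has static storage). A condensed sketch of this guarded lazy-init pattern, with a placeholder registry type standing in for ggml_backend_reg:

    #include <mutex>

    struct registry { int api_version; };

    static registry * get_registry(void) {
        static std::mutex mutex;
        static registry  reg;
        static bool      initialized = false;

        std::lock_guard<std::mutex> lock(mutex);
        if (initialized) {
            return &reg;
        }
        initialized = true;

        // Expensive one-time setup (device probing in the real code) goes here.
        reg = registry{ /* .api_version = */ 1 };
        return &reg;
    }

    int main() {
        return get_registry()->api_version == 1 ? 0 : 1;
    }

std::call_once would be an equivalent alternative; the mutex variant keeps the early-return-on-repeat-call shape used above.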
@@ -2942,14 +3188,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
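
The local_work_size_ptr fallback introduced here (and repeated below for mul, silu, relu, sigmoid, clamp, scale and diag_mask_inf) works around the OpenCL 1.x rule that each global work size must be a multiple of the corresponding local work size; when n is not a multiple of 64 and the device lacks non-uniform work-group support, passing NULL lets the driver pick a legal size. The rule, restated as a standalone helper (the non_uniform_ok flag stands in for backend_ctx->non_uniform_workgroups; pick_local_size is a hypothetical name):

    #include <assert.h>
    #include <stddef.h>

    /* Returns the local size to pass to clEnqueueNDRangeKernel, or NULL to
     * let the driver choose. Without non-uniform work-group support (an
     * OpenCL 2.0 feature), a non-NULL local size must divide the global
     * size evenly. */
    static const size_t * pick_local_size(size_t global, const size_t local[3],
                                          int non_uniform_ok) {
        if (global % local[0] == 0 || non_uniform_ok) {
            return local;
        }
        return NULL;
    }

    int main(void) {
        const size_t local[3] = {64, 1, 1};
        assert(pick_local_size(128, local, 0) == local); /* divides evenly     */
        assert(pick_local_size(100, local, 0) == NULL);  /* driver must choose */
        assert(pick_local_size(100, local, 1) == local); /* non-uniform OK     */
        return 0;
    }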
@@ -3077,14 +3328,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         size_t global_work_size[] = {(size_t)n, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     } else {
         unsigned int nth = MIN(64, ne0);
@@ -3103,54 +3359,304 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     }
 }
 
-static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
-    UNUSED(src1);
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
     cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
     cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
 
+    bool bcast_row = false;
     cl_kernel kernel;
 
-    int n = ggml_nelements(dst);
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
 
-    if (n % 4 == 0) {
-        kernel = backend_ctx->kernel_gelu_4;
-        n /= 4;
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
     } else {
-        kernel = backend_ctx->kernel_gelu;
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
     }
 
-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
 
-    size_t global_work_size[] = {(size_t)n, 1, 1};
-    size_t local_work_size[] = {64, 1, 1};
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
 
 #ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
 
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
 #else
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
 #endif
+    }
 }
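
ggml_cl_div (and ggml_cl_sub below) follow the same two-path shape as ggml_cl_add: a vectorized *_row kernel when src1 is a single contiguous row and both row lengths are multiples of 4 (so the kernel can process float4 chunks), and a general strided kernel otherwise. The gate condition, restated as a hypothetical standalone helper:

    #include <cassert>
    #include <cstdint>

    // True when the row-broadcast fast path applies: src1 is one contiguous
    // row and both row lengths are multiples of 4 (float4-sized chunks).
    static bool use_bcast_row(int64_t src1_nelements, bool src1_is_contiguous,
                              int ne00, int ne10) {
        return src1_nelements == ne10 && src1_is_contiguous &&
               ne00 % 4 == 0 && ne10 % 4 == 0;
    }

    int main() {
        assert( use_bcast_row(4096, true, 4096, 4096)); // row broadcast
        assert(!use_bcast_row(4096, true, 4096, 2048)); // src1 not a single row
        assert(!use_bcast_row(  10, true,   10,   10)); // not float4-aligned
        return 0;
    }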
 
-static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_sub_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_sub;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_gelu_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_gelu;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+#endif
+}
+
+static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
     GGML_ASSERT(dst);
@@ -3233,14 +3739,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3273,14 +3784,71 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_sigmoid_f32;
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_sigmoid_f16;
+    } else {
+        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3320,14 +3888,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -3476,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
 #endif
 }
 
+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float eps = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[] = {(size_t)sgs, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
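
ggml_cl_group_norm launches one subgroup-sized work-group per channel group: group_size is the per-channel plane (ne00*ne01) times the number of channels per group rounded up, and sgs matches the GPU's subgroup width (64 on Adreno, 32 on Intel; other GPUs are rejected). A small worked example of that launch geometry, with a made-up 8x8x6 tensor split into 4 groups:

    #include <cstdio>

    int main() {
        const int ne00 = 8, ne01 = 8, ne02 = 6; // hypothetical W, H, channels
        const int n_groups = 4;                 // from dst->op_params[0]

        // Channels per group, rounded up, times the per-channel plane size.
        const int group_size = ne00 * ne01 * ((ne02 + n_groups - 1) / n_groups);

        const size_t sgs = 64;                        // subgroup size, e.g. Adreno
        const size_t global = (size_t)n_groups * sgs; // one work-group per group

        // Prints: group_size=128 global=256 local=64
        printf("group_size=%d global=%zu local=%zu\n", group_size, global, sgs);
        return 0;
    }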
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -4230,14 +4862,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }
 
@@ -4418,14 +5055,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
         size_t local_work_size[] = {64, 1, 1};
 
+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
         cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
         g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
     }
 }
@@ -4815,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
 #endif
 }
 
+static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    int ne00_padded = 1;
+    while (ne00_padded < ne00) {
+        ne00_padded *= 2;
+    }
+
+    int order = (enum ggml_sort_order) dst->op_params[0];
+
+    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
+    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
+
+    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
+    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
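
ggml_cl_argsort pads each row length up to the next power of two, launches exactly ne00_padded work-items per row, and reserves ne00_padded ints of local memory as sort scratch (the size-only clSetKernelArg with a NULL pointer); a power-of-two extent is what a bitonic-style sort needs, though the kernel itself is not shown in this diff. The padding loop in isolation:

    #include <cassert>

    // Round n up to the next power of two (n >= 1), as done for ne00_padded.
    static int next_pow2(int n) {
        int p = 1;
        while (p < n) {
            p *= 2;
        }
        return p;
    }

    int main() {
        assert(next_pow2(1)   == 1);
        assert(next_pow2(100) == 128);
        assert(next_pow2(128) == 128);
        return 0;
    }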
+
+static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
+
+    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
@@ -4855,8 +5615,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             if (!any_on_device) {
                 return false;
             }
-            GGML_ASSERT(ggml_is_contiguous(src0));
-            GGML_ASSERT(ggml_is_contiguous(src1));
            func = ggml_cl_add;
            break;
        case GGML_OP_MUL:
@@ -4865,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
+        case GGML_OP_SUB:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sub;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -4891,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                     }
                     func = ggml_cl_relu;
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_sigmoid;
+                    break;
                 default:
                     return false;
             } break;
@@ -4912,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
@@ -4957,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_im2col;
             break;
+        case GGML_OP_ARGSORT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_argsort;
+            break;
+        case GGML_OP_SUM_ROWS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sum_rows;
+            break;
         default:
             return false;
     }
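
Every op added to this dispatch switch (DIV, SUB, SIGMOID, GROUP_NORM, ARGSORT, SUM_ROWS) follows the same three-line pattern: return false when no operand is resident on the device, otherwise bind the ggml_cl_* handler. A condensed sketch of that shape with placeholder ops and handlers (not the real ggml enums or functions):

    // Placeholder handler type and ops; the real code binds ggml_cl_* functions.
    enum class op { div, sub, other };
    using handler = void (*)();

    static void handle_div() {}
    static void handle_sub() {}

    static handler pick_handler(op o, bool any_on_device) {
        if (!any_on_device) {
            return nullptr; // caller falls back to another backend
        }
        switch (o) {
            case op::div: return handle_div;
            case op::sub: return handle_sub;
            default:      return nullptr;
        }
    }

    int main() {
        return pick_handler(op::div, true) == handle_div ? 0 : 1;
    }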