@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp (new file)
@@ -0,0 +1,4004 @@
1
+ #define CL_TARGET_OPENCL_VERSION 220
2
+ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
3
+
4
+ // suppress warnings in CL headers for GCC and Clang
5
+ #pragma GCC diagnostic ignored "-Woverlength-strings"
6
+ #ifdef __clang__
7
+ #pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
8
+ #endif
9
+
10
+ #include "ggml-opencl.h"
11
+ #include "ggml-backend.h"
12
+ #include "ggml-impl.h"
13
+ #include "ggml-backend-impl.h"
14
+ #include "ggml.h"
15
+
16
+ #include <CL/cl.h>
17
+
18
+ #include <string.h>
19
+
20
+ #include <cstddef>
21
+ #include <cstdint>
22
+ #include <atomic>
23
+ #include <fstream>
24
+ #include <limits>
25
+ #include <vector>
26
+ #include <string>
27
+ #include <cmath>
28
+
29
+ #undef MIN
30
+ #undef MAX
31
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
32
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
33
+
34
+ #define UNUSED(x) (void)(x)
35
+
36
+ #define CL_CHECK(err) \
37
+ do { \
38
+ cl_int err_ = (err); \
39
+ if (err_ != CL_SUCCESS) { \
40
+ GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n", \
41
+ #err, err_, __FILE__, __LINE__); \
42
+ GGML_ASSERT(0); \
43
+ } \
44
+ } while (0)
45
+
46
+ //------------------------------------------------------------------------------
47
+ // OpenCL
48
+ //------------------------------------------------------------------------------
49
+
50
+ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
51
+
52
+ enum GPU_FAMILY {
53
+ ADRENO,
54
+ INTEL,
55
+ UNKNOWN,
56
+ };
57
+
58
+ enum ADRENO_GPU_GEN {
59
+ ADRENO_UNKNOWN,
60
+ A7X,
61
+ A8X,
62
+ X1E,
63
+ };
64
+
65
+ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
66
+ if (strstr(device_name, "730") ||
67
+ strstr(device_name, "740") ||
68
+ strstr(device_name, "750")) {
69
+ return ADRENO_GPU_GEN::A7X;
70
+ }
71
+
72
+ if (strstr(device_name, "830")) {
73
+ return ADRENO_GPU_GEN::A8X;
74
+ }
75
+
76
+ if (strstr(device_name, "X1")) {
77
+ return ADRENO_GPU_GEN::X1E;
78
+ }
79
+
80
+ return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
81
+ }
82
+
83
+ static int get_adreno_cl_compiler_version(const char *driver_version) {
84
+ std::string driver_ver_str(driver_version);
85
+ size_t compiler_ver_pos = driver_ver_str.find("E031");
86
+ size_t compiler_ver_len = 13;
87
+ size_t compiler_ver_offset = 5;
88
+
89
+ if (compiler_ver_pos == std::string::npos) {
90
+ compiler_ver_pos = driver_ver_str.find("DX");
91
+ if (compiler_ver_pos == std::string::npos) {
92
+ return -1;
93
+ }
94
+ compiler_ver_len = 11;
95
+ compiler_ver_offset = 3;
96
+ }
97
+
98
+ std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
99
+ std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2);
100
+ return std::atoi(major_ver_str.c_str());
101
+ }
102
+
103
+ // backend device context
104
+ struct ggml_backend_opencl_device_context {
105
+ cl_platform_id platform;
106
+ std::string platform_name;
107
+
108
+ cl_device_id device;
109
+ std::string device_name;
110
+ };
111
+
112
+ // backend context
113
+ struct ggml_backend_opencl_context {
114
+ cl_device_id device;
115
+ std::string device_name;
116
+
117
+ std::string driver_version;
118
+
119
+ GPU_FAMILY gpu_family;
120
+ ADRENO_GPU_GEN adreno_gen;
121
+
122
+ cl_int alignment;
123
+ size_t max_alloc_size;
124
+ bool fp16_support;
125
+
126
+ int adreno_wave_size;
127
+
128
+ cl_context context;
129
+ cl_command_queue queue;
130
+
131
+ cl_program program;
132
+ cl_program program_1;
133
+ cl_program program_2;
134
+
135
+ cl_kernel kernel_add, kernel_add_row;
136
+ cl_kernel kernel_mul, kernel_mul_row;
137
+ cl_kernel kernel_scale;
138
+ cl_kernel kernel_silu, kernel_silu_4;
139
+ cl_kernel kernel_gelu, kernel_gelu_4;
140
+ cl_kernel kernel_relu;
141
+ cl_kernel kernel_clamp;
142
+ cl_kernel kernel_norm;
143
+ cl_kernel kernel_rms_norm;
144
+ cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
145
+ cl_kernel kernel_soft_max, kernel_soft_max_4;
146
+ cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
147
+ cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
148
+ cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
149
+ cl_kernel kernel_mul_mat_f32_f32;
150
+ cl_kernel kernel_mul_mat_f16_f16;
151
+ cl_kernel kernel_mul_mat_f16_f32_1row;
152
+ cl_kernel kernel_mul_mat_f16_f32;
153
+ cl_kernel kernel_mul_mat_f16_f32_l4;
154
+ cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
155
+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
156
+ cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
157
+ cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
158
+ kernel_mul_mat_q4_0_f32_flat_img_v0;
159
+ cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
160
+ cl_kernel kernel_mul_mv_q6_K_f32;
161
+
162
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
163
+ // Transpose kernels
164
+ cl_program program_transpose_32;
165
+ cl_program program_transpose_32_16;
166
+ cl_program program_transpose_16;
167
+ cl_kernel kernel_transpose_32;
168
+ cl_kernel kernel_transpose_32_16;
169
+ cl_kernel kernel_transpose_16;
170
+
171
+ cl_mem A_s_d_max; // max scale buffer size for transpose
172
+ cl_mem A_q_d_max; // max weight buffer size for transpose
173
+ cl_mem B_d_max; // max activation buffer size for transpose
174
+
175
+ // Gemm and Gemv related programs, kernels, etc
176
+ cl_program program_CL_gemm;
177
+ cl_program program_CL_gemv_general;
178
+ cl_program program_CL_gemv_4096_1_11008;
179
+ cl_program program_CL_gemv_4096_1_4096;
180
+ cl_program program_CL_gemv_11008_1_4096;
181
+ cl_program program_CL_gemv_32000_1_4096;
182
+ cl_kernel CL_mul_mat_Ab_Bi_8x4;
183
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
184
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
185
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
186
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
187
+ cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
188
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
189
+ };
190
+
191
+ static ggml_backend_device g_ggml_backend_opencl_device;
192
+ static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
193
+ /*.platform =*/ nullptr,
194
+ /*.platform_nane =*/ "",
195
+ /*.device =*/ nullptr,
196
+ /*.device_name =*/ "",
197
+ };
198
+
199
+ static int ggml_backend_opencl_n_devices = 0;
200
+
201
+ // Profiling
202
+ #ifdef GGML_OPENCL_PROFILING
203
+ struct ProfilingInfo {
204
+ std::string op_name;
205
+ std::string kernel_name;
206
+ // Kernel execution time in nanoseconds.
207
+ cl_ulong duration_ns;
208
+ // Global and local work sizes.
209
+ size_t global_size[3];
210
+ size_t local_size[3];
211
+ // Op output size.
212
+ size_t output_size[4];
213
+ };
214
+
215
+ std::vector<ProfilingInfo> g_profiling_info;
216
+ #endif
217
+
218
+ inline std::string read_file(const std::string &path) {
219
+ std::ifstream ifs(path);
220
+ if (!ifs) {
221
+ return "";
222
+ }
223
+ std::string text;
224
+ ifs.seekg(0, std::ios::end);
225
+ text.resize(ifs.tellg());
226
+ ifs.seekg(0, std::ios::beg);
227
+ ifs.read(&text[0], text.size());
228
+ return text;
229
+ }
230
+
231
+ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
232
+ cl_program p;
233
+ char *program_log;
234
+ size_t program_size;
235
+ size_t log_size;
236
+ int err;
237
+
238
+ program_size = strlen(program_buffer);
239
+
240
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
241
+ if(err < 0) {
242
+ GGML_LOG_ERROR("OpenCL error creating program");
243
+ exit(1);
244
+ }
245
+
246
+ err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
247
+ if(err < 0) {
248
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
249
+ program_log = (char*) malloc(log_size + 1);
250
+ program_log[log_size] = '\0';
251
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
252
+ GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
253
+ free(program_log);
254
+ exit(1);
255
+ }
256
+
257
+ return p;
258
+ }
259
+
260
+ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
261
+ static bool initialized = false;
262
+ static ggml_backend_opencl_context *backend_ctx = nullptr;
263
+
264
+ if (initialized) {
265
+ return backend_ctx;
266
+ }
267
+
268
+ ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
269
+ GGML_ASSERT(dev_ctx);
270
+ GGML_ASSERT(dev_ctx->platform == nullptr);
271
+ GGML_ASSERT(dev_ctx->device == nullptr);
272
+ GGML_ASSERT(backend_ctx == nullptr);
273
+
274
+ initialized = true;
275
+ backend_ctx = new ggml_backend_opencl_context();
276
+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
277
+
278
+ cl_int err;
279
+
280
+ #ifdef GGML_PROFILE_OPENCL
281
+ GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
282
+ #endif
283
+
284
+ struct cl_device;
285
+ struct cl_platform {
286
+ cl_platform_id id;
287
+ unsigned number;
288
+ char name[128];
289
+ char vendor[128];
290
+ struct cl_device * devices;
291
+ unsigned n_devices;
292
+ struct cl_device * default_device;
293
+ };
294
+
295
+ struct cl_device {
296
+ struct cl_platform * platform;
297
+ cl_device_id id;
298
+ unsigned number;
299
+ cl_device_type type;
300
+ char name[128];
301
+ };
302
+
303
+ enum { NPLAT = 16, NDEV = 16 };
304
+
305
+ struct cl_platform platforms[NPLAT];
306
+ unsigned n_platforms = 0;
307
+ struct cl_device devices[NDEV];
308
+ unsigned n_devices = 0;
309
+ struct cl_device * default_device = NULL;
310
+
311
+ cl_platform_id platform_ids[NPLAT];
312
+ if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
313
+ GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
314
+ return backend_ctx;
315
+ }
316
+
317
+ for (unsigned i = 0; i < n_platforms; i++) {
318
+ struct cl_platform * p = &platforms[i];
319
+ p->number = i;
320
+ p->id = platform_ids[i];
321
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
322
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
323
+
324
+ cl_device_id device_ids[NDEV];
325
+ cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
326
+ if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
327
+ p->n_devices = 0;
328
+ } else {
329
+ CL_CHECK(clGetDeviceIDsError);
330
+ }
331
+ p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
332
+ p->default_device = NULL;
333
+
334
+ for (unsigned j = 0; j < p->n_devices; j++) {
335
+ struct cl_device * d = &devices[n_devices];
336
+ d->number = n_devices++;
337
+ d->id = device_ids[j];
338
+ d->platform = p;
339
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
340
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
341
+
342
+ if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
343
+ p->default_device = d;
344
+ }
345
+ }
346
+
347
+ if (default_device == NULL && p->default_device != NULL) {
348
+ default_device = p->default_device;
349
+ }
350
+ }
351
+
352
+ if (n_devices == 0) {
353
+ GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
354
+ return backend_ctx;
355
+ }
356
+
357
+ char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
358
+ char * user_device_string = getenv("GGML_OPENCL_DEVICE");
359
+ int user_platform_number = -1;
360
+ int user_device_number = -1;
361
+
362
+ unsigned n;
363
+ if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
364
+ user_platform_number = (int)n;
365
+ }
366
+ if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
367
+ user_device_number = (int)n;
368
+ }
369
+ if (user_platform_number != -1 && user_device_number != -1) {
370
+ cl_platform* platform = &platforms[user_platform_number];
371
+ if ((unsigned)user_device_number >= platform->n_devices) {
372
+ GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
373
+ exit(1);
374
+ }
375
+ default_device = &platform->devices[user_device_number];
376
+ } else {
377
+
378
+ struct cl_device * selected_devices = devices;
379
+ unsigned n_selected_devices = n_devices;
380
+
381
+ if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
382
+ for (unsigned i = 0; i < n_platforms; i++) {
383
+ struct cl_platform * p = &platforms[i];
384
+ if (strstr(p->name, user_platform_string) != NULL ||
385
+ strstr(p->vendor, user_platform_string) != NULL) {
386
+ user_platform_number = (int)i;
387
+ break;
388
+ }
389
+ }
390
+ if (user_platform_number == -1) {
391
+ GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
392
+ exit(1);
393
+ }
394
+ }
395
+ if (user_platform_number != -1) {
396
+ struct cl_platform * p = &platforms[user_platform_number];
397
+ selected_devices = p->devices;
398
+ n_selected_devices = p->n_devices;
399
+ default_device = p->default_device;
400
+ if (n_selected_devices == 0) {
401
+ GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
402
+ exit(1);
403
+ }
404
+ }
405
+
406
+ if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
407
+ for (unsigned i = 0; i < n_selected_devices; i++) {
408
+ struct cl_device * d = &selected_devices[i];
409
+ if (strstr(d->name, user_device_string) != NULL) {
410
+ user_device_number = d->number;
411
+ break;
412
+ }
413
+ }
414
+ if (user_device_number == -1) {
415
+ GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
416
+ exit(1);
417
+ }
418
+ }
419
+ if (user_device_number != -1) {
420
+ selected_devices = &devices[user_device_number];
421
+ n_selected_devices = 1;
422
+ default_device = &selected_devices[0];
423
+ }
424
+
425
+ GGML_ASSERT(n_selected_devices > 0);
426
+
427
+ if (default_device == NULL) {
428
+ default_device = &selected_devices[0];
429
+ }
430
+ }
431
+
432
+ GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
433
+ GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
434
+ if (default_device->type != CL_DEVICE_TYPE_GPU) {
435
+ GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
436
+ }
437
+
438
+ dev_ctx->platform = default_device->platform->id;
439
+ dev_ctx->device = default_device->id;
440
+ backend_ctx->device = default_device->id;
441
+
442
+ if (strstr(default_device->name, "Adreno")) {
443
+ backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
444
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
445
+
446
+ // Default wave size is 128, A8x uses 64.
447
+ if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
448
+ backend_ctx->adreno_wave_size = 64;
449
+ } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
450
+ backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
451
+ backend_ctx->adreno_wave_size = 128;
452
+ } else {
453
+ backend_ctx->adreno_wave_size = 128;
454
+ GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
455
+ "using wave size %d, "
456
+ "may not work as expected\n",
457
+ backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
458
+ }
459
+ } else if (strstr(default_device->name, "Intel")) {
460
+ backend_ctx->gpu_family = GPU_FAMILY::INTEL;
461
+ } else {
462
+ GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
463
+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
464
+ return backend_ctx;
465
+ }
466
+
467
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
468
+ if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
469
+ GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
470
+ "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
471
+ return backend_ctx;
472
+ }
473
+ #endif
474
+
475
+ // Populate backend device name
476
+ dev_ctx->platform_name = default_device->platform->name;
477
+ dev_ctx->device_name = default_device->name;
478
+ backend_ctx->device_name = default_device->name;
479
+
480
+ // A local ref of cl_device_id for convenience
481
+ cl_device_id device = backend_ctx->device;
482
+
483
+ // Check device OpenCL version, OpenCL 2.0 or above is required
484
+ size_t device_ver_str_size;
485
+ clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
486
+ char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
487
+ clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
488
+ device_ver_buffer[device_ver_str_size] = '\0';
489
+ GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
490
+
491
+ if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
492
+ strstr(device_ver_buffer, "OpenCL 3") == NULL) {
493
+ GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
494
+ return backend_ctx;
495
+ }
496
+
497
+ // Check driver version
498
+ size_t driver_version_str_size;
499
+ clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
500
+ char *driver_version = (char *)alloca(driver_version_str_size + 1);
501
+ clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
502
+ driver_version[driver_version_str_size] = '\0';
503
+ GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
504
+ backend_ctx->driver_version = driver_version;
505
+
506
+ int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
507
+ bool has_vector_subgroup_broadcast =
508
+ adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
509
+ GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
510
+ has_vector_subgroup_broadcast ? "true" : "false");
511
+
512
+ size_t ext_str_size;
513
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
514
+ char *ext_buffer = (char *)alloca(ext_str_size + 1);
515
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
516
+ ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
517
+ // Check if ext_buffer contains cl_khr_fp16
518
+ backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
519
+ GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
520
+
521
+ // fp16 is required
522
+ if (!backend_ctx->fp16_support) {
523
+ GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
524
+ return backend_ctx;
525
+ }
526
+
527
+ // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
528
+ // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
529
+ if (strstr(device_ver_buffer, "OpenCL 3") &&
530
+ strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
531
+ strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
532
+ GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
533
+ "(note that subgroups is an optional feature in OpenCL 3.0)\n");
534
+ return backend_ctx;
535
+ }
536
+
537
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
538
+ GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
539
+
540
+ clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
541
+ GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
542
+
543
+ // Check SVM.
544
+ cl_device_svm_capabilities svm_caps;
545
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
546
+ GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
547
+ svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
548
+ GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
549
+ svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
550
+ GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
551
+ svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
552
+ GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
553
+ svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
554
+
555
+ // Print out configurations
556
+ #ifdef GGML_OPENCL_SOA_Q
557
+ GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
558
+ #endif // GGML_OPENCL_SOA_Q
559
+
560
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
561
+ GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
562
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
563
+
564
+ cl_context_properties properties[] = {
565
+ (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
566
+ };
567
+
568
+ CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
569
+
570
+ // A local ref of cl_context for convenience
571
+ cl_context context = backend_ctx->context;
572
+
573
+ //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
574
+ // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
575
+ // (queue = clCreateCommandQueue(context, device, 0, &err), err)
576
+ //)));
577
+ cl_command_queue_properties command_queue_props = 0;
578
+ #ifdef GGML_OPENCL_PROFILING
579
+ command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
580
+ #endif
581
+ CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
582
+
583
+ #ifdef GGML_OPENCL_EMBED_KERNELS
584
+ const std::string kernel_src {
585
+ #include "ggml-opencl.cl.h"
586
+ };
587
+ #else
588
+ const std::string kernel_src = read_file("ggml-opencl.cl");
589
+ #endif
590
+
591
+ std::string compile_opts =
592
+ "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
593
+ "-cl-finite-math-only -cl-fast-relaxed-math ";
594
+ backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
595
+
596
+ // Non matmul kernels.
597
+ CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
598
+ CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
599
+ CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
600
+ CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
601
+ CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
602
+ CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
603
+ CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
604
+ CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
605
+ CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
606
+ CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
607
+ CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
608
+ CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
609
+ CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
610
+ CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
611
+ CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
612
+ CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
613
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
614
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
615
+ CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
616
+ CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
617
+ CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
618
+ CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
619
+ CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
620
+ CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
621
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
622
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
623
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
624
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
625
+
626
+ // Matmul kernels.
627
+ CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
628
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
629
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
630
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
631
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
632
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
633
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
634
+
635
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
636
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
637
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
638
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
639
+
640
+ // Load additional mulmat kernels.
641
+ #ifdef GGML_OPENCL_EMBED_KERNELS
642
+ const std::string kernel_src_1 {
643
+ #include "ggml-opencl_mm.cl.h"
644
+ };
645
+ #else
646
+ const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
647
+ #endif
648
+ backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
649
+
650
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
651
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
652
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
653
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
654
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
655
+
656
+ // Load additional data conversion kernels.
657
+ #ifdef GGML_OPENCL_EMBED_KERNELS
658
+ const std::string kernel_src_2 {
659
+ #include "ggml-opencl_cvt.cl.h"
660
+ };
661
+ #else
662
+ const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
663
+ #endif
664
+ backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
665
+
666
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
667
+
668
+ // Kernels for Adreno
669
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
670
+ #ifdef GGML_OPENCL_EMBED_KERNELS
671
+ const std::string transpose_32_src {
672
+ #include "ggml-opencl_transpose_32.cl.h"
673
+ };
674
+ #else
675
+ const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
676
+ #endif
677
+ backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
678
+ CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
679
+
680
+ #ifdef GGML_OPENCL_EMBED_KERNELS
681
+ const std::string transpose_32_16_src {
682
+ #include "ggml-opencl_transpose_32_16.cl.h"
683
+ };
684
+ #else
685
+ const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
686
+ #endif
687
+ backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
688
+ CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
689
+
690
+ #ifdef GGML_OPENCL_EMBED_KERNELS
691
+ const std::string transpose_16_src {
692
+ #include "ggml-opencl_transpose_16.cl.h"
693
+ };
694
+ #else
695
+ const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
696
+ #endif
697
+ backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
698
+ CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
699
+
700
+ // Gemv general
701
+ std::string CL_gemv_compile_opts =
702
+ " -cl-std=CL2.0 "
703
+ " -cl-mad-enable "
704
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
705
+ if (has_vector_subgroup_broadcast) {
706
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
707
+ }
708
+ #ifdef GGML_OPENCL_EMBED_KERNELS
709
+ const std::string kernel_src_CL_gemv_general {
710
+ #include "ggml-opencl_gemv_noshuffle_general.cl.h"
711
+ };
712
+ #else
713
+ const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
714
+ #endif
715
+
716
+ backend_ctx->program_CL_gemv_general = build_program_from_source(
717
+ context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
718
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
719
+
720
+ // Gemv 2048, 16384
721
+ CL_gemv_compile_opts =
722
+ " -cl-std=CL2.0 "
723
+ " -cl-mad-enable "
724
+ " -DLINE_STRIDE_A=2048 "
725
+ " -DBLOCK_STRIDE_A=16384 "
726
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
727
+ if (has_vector_subgroup_broadcast) {
728
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
729
+ }
730
+ #ifdef GGML_OPENCL_EMBED_KERNELS
731
+ const std::string kernel_src_CL_gemv {
732
+ #include "ggml-opencl_gemv_noshuffle.cl.h"
733
+ };
734
+ #else
735
+ const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
736
+ #endif
737
+
738
+ backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
739
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
740
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
741
+
742
+ // Gemv 2048, 16384
743
+ CL_gemv_compile_opts =
744
+ " -cl-std=CL2.0 "
745
+ " -cl-mad-enable "
746
+ " -DLINE_STRIDE_A=2048 "
747
+ " -DBLOCK_STRIDE_A=16384 "
748
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
749
+ if (has_vector_subgroup_broadcast) {
750
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
751
+ }
752
+
753
+ backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
754
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
755
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
756
+
757
+ // Gemv 5504, 44032
758
+ CL_gemv_compile_opts =
759
+ " -cl-std=CL2.0 "
760
+ " -cl-mad-enable "
761
+ " -DLINE_STRIDE_A=5504 "
762
+ " -DBLOCK_STRIDE_A=44032 "
763
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
764
+ if (has_vector_subgroup_broadcast) {
765
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
766
+ }
767
+
768
+ backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
769
+ context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
770
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
771
+
772
+ // Gemv 16000, 128000
773
+ CL_gemv_compile_opts =
774
+ " -cl-std=CL2.0 "
775
+ " -cl-mad-enable "
776
+ " -DLINE_STRIDE_A=16000 "
777
+ " -DBLOCK_STRIDE_A=128000 "
778
+ " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
779
+ if (has_vector_subgroup_broadcast) {
780
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
781
+ }
782
+
783
+ backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
784
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
785
+
786
+ // Gemm
787
+ #ifdef GGML_OPENCL_EMBED_KERNELS
788
+ const std::string kernel_src_CL_gemm {
789
+ #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
790
+ };
791
+ #else
792
+ const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
793
+ #endif
794
+ backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
795
+ CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
796
+
797
+ // Allocate intermediate buffers and images
798
+ size_t max_A_q_d_bytes = 311164928;
799
+ size_t max_A_s_d_bytes = 38895616;
800
+ size_t max_B_d_bytes = 45088768;
801
+
802
+ CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
803
+ CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
804
+ CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
805
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
806
+
807
+ // For now we support a single devices
808
+ ggml_backend_opencl_n_devices = 1;
809
+
810
+ return backend_ctx;
811
+ }
812
+
813
+ static void ggml_cl2_free(void) {
814
+ #ifdef GGML_OPENCL_PROFILING
815
+ FILE * fperf = fopen("cl_profiling.csv", "w");
816
+ if (!fperf) {
817
+ GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
818
+ return;
819
+ }
820
+
821
+ float total_kernel_time = 0;
822
+ fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
823
+ for (const ProfilingInfo & info : g_profiling_info) {
824
+ total_kernel_time += info.duration_ns/1.e6f;
825
+ fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
826
+ info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
827
+ info.global_size[0], info.global_size[1], info.global_size[2],
828
+ info.local_size[0], info.local_size[2], info.local_size[2],
829
+ info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
830
+ }
831
+ fclose(fperf);
832
+
833
+ GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
834
+ #endif
835
+ }
836
+
837
+ //------------------------------------------------------------------------------
838
+ // Tensor extra management
839
+ //------------------------------------------------------------------------------
840
+ struct ggml_tensor_extra_cl {
841
+ // The buffer object that holds the data.
842
+ cl_mem data_device;
843
+ // The offset into the buffer object. This is primarily for scratch buffer
844
+ // and view operation.
845
+ // NB: this offset no longer includes view offset (view_offs). Whenever this
846
+ // offset is used, view_offs should be considered.
847
+ cl_ulong offset;
848
+ // The actual size of the cl_mem object. This is needed when returning the
849
+ // block to the pool.
850
+ size_t actual_size;
851
+
852
+ void reset() {
853
+ data_device = nullptr;
854
+ offset = 0;
855
+ actual_size = 0;
856
+ }
857
+ };
858
+
859
+ // Additional tensor extra structs for quantized tensors.
860
+ // These tensors are loaded from files and should not be allocated in scratch --
861
+ // they should always be allocated from the pool. Hence, they do not have an
862
+ // `offset`, which would indicate their location in the scratch buffer.
863
+ struct ggml_tensor_extra_cl_q4_0 {
864
+ // Quantized values.
865
+ cl_mem q = nullptr;
866
+ // Quantized values in image1d_buffer_t.
867
+ cl_mem q_img = nullptr;
868
+ // Scales.
869
+ cl_mem d = nullptr;
870
+ // Scales in image1d_buffer_t.
871
+ cl_mem d_img = nullptr;
872
+ // Size of quantized values.
873
+ size_t size_q = 0;
874
+ // Size of scales.
875
+ size_t size_d = 0;
876
+
877
+ ~ggml_tensor_extra_cl_q4_0() {
878
+ reset();
879
+ }
880
+
881
+ void reset() {
882
+ // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
883
+ // They must be properly released so that the original buffer can be
884
+ // properly released to avoid memory leak.
885
+ if (q != nullptr) {
886
+ CL_CHECK(clReleaseMemObject(q));
887
+ q = nullptr;
888
+ }
889
+ if (d != nullptr) {
890
+ CL_CHECK(clReleaseMemObject(d));
891
+ d = nullptr;
892
+ }
893
+ // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
894
+ // enabled. They point to the images in ggml_backend_opencl_buffer_context.
895
+ // So, there is no need to release them here.
896
+ // TODO: initialize them for the non-SMALL_ALLOC path, or remove them.
897
+ q_img = nullptr;
898
+ d_img = nullptr;
899
+ size_q = 0;
900
+ size_d = 0;
901
+ }
902
+ };
903
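Since `offset` in ggml_tensor_extra_cl no longer includes the view offset, call sites later in this file add the two together (e.g. `extra->offset + src0->view_offs`). A minimal sketch of that convention as a helper, assuming the ggml headers are available (the helper itself is not part of the source):

    static cl_ulong ggml_cl_tensor_offset(const ggml_tensor * t) {
        const ggml_tensor_extra_cl * extra = (const ggml_tensor_extra_cl *) t->extra;
        // effective device offset = extra offset + view offset into the parent tensor
        return extra->offset + t->view_offs;
    }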
+
904
+ //------------------------------------------------------------------------------
905
+ // Backend API
906
+ //------------------------------------------------------------------------------
907
+
908
+ //
909
+ // backend
910
+ //
911
+ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
912
+ return "OpenCL";
913
+
914
+ UNUSED(backend);
915
+ }
916
+
917
+ static void ggml_backend_opencl_free(ggml_backend_t backend) {
918
+ ggml_cl2_free();
919
+
920
+ GGML_UNUSED(backend);
921
+ }
922
+
923
+ static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
924
+ GGML_UNUSED(backend);
925
+ GGML_UNUSED(tensor);
926
+ GGML_UNUSED(data);
927
+ GGML_UNUSED(offset);
928
+ GGML_UNUSED(size);
929
+ }
930
+
931
+ static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
932
+ GGML_UNUSED(backend);
933
+ GGML_UNUSED(tensor);
934
+ GGML_UNUSED(data);
935
+ GGML_UNUSED(offset);
936
+ GGML_UNUSED(size);
937
+ }
938
+
939
+ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
940
+ GGML_UNUSED(backend);
941
+ GGML_UNUSED(src);
942
+ GGML_UNUSED(dst);
943
+ return false;
944
+ }
945
+
946
+ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
947
+ GGML_UNUSED(backend);
948
+ }
949
+
950
+ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
951
+ for (int i = 0; i < cgraph->n_nodes; i++) {
952
+ ggml_tensor * node = cgraph->nodes[i];
953
+
954
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
955
+ continue;
956
+ }
957
+
958
+ bool ok = ggml_cl_compute_forward(backend, node);
959
+ if (!ok) {
960
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
961
+ }
962
+ GGML_ASSERT(ok);
963
+ }
964
+
965
+ return GGML_STATUS_SUCCESS;
966
+ }
967
+
968
+ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
969
+ GGML_UNUSED(dev);
970
+
971
+ switch (op->op) {
972
+ case GGML_OP_NONE:
973
+ return true;
974
+ case GGML_OP_GET_ROWS:
975
+ switch (op->src[0]->type) {
976
+ case GGML_TYPE_F32:
977
+ case GGML_TYPE_F16:
978
+ return true;
979
+ case GGML_TYPE_Q4_0:
980
+ #ifdef GGML_OPENCL_SOA_Q
981
+ // We do not support flattened Q4_0 (and possibly other Q's)
982
+ return false;
983
+ #else // GGML_OPENCL_SOA_Q
984
+ return true;
985
+ #endif // GGML_OPENCL_SOA_Q
986
+ default:
987
+ return false;
988
+ }
989
+ case GGML_OP_CPY:
990
+ case GGML_OP_DUP:
991
+ case GGML_OP_CONT:
992
+ switch (op->src[0]->type) {
993
+ case GGML_TYPE_F32:
994
+ switch (op->type) {
995
+ case GGML_TYPE_F16:
996
+ case GGML_TYPE_F32:
997
+ return true;
998
+ default:
999
+ return false;
1000
+ }
1001
+ case GGML_TYPE_F16:
1002
+ switch (op->type) {
1003
+ case GGML_TYPE_F16:
1004
+ case GGML_TYPE_F32:
1005
+ return true;
1006
+ default:
1007
+ return false;
1008
+ }
1009
+ default:
1010
+ return false;
1011
+ }
1012
+ case GGML_OP_ADD:
1013
+ case GGML_OP_SCALE:
1014
+ case GGML_OP_MUL:
1015
+ return true;
1016
+ case GGML_OP_UNARY:
1017
+ switch (ggml_get_unary_op(op)) {
1018
+ case GGML_UNARY_OP_GELU:
1019
+ case GGML_UNARY_OP_SILU:
1020
+ case GGML_UNARY_OP_RELU:
1021
+ return ggml_is_contiguous(op->src[0]);
1022
+ default:
1023
+ return false;
1024
+ }
1025
+ case GGML_OP_CLAMP:
1026
+ case GGML_OP_SOFT_MAX:
1027
+ case GGML_OP_NORM:
1028
+ case GGML_OP_RMS_NORM:
1029
+ return true;
1030
+ case GGML_OP_MUL_MAT:
1031
+ if (op->src[0]->type == GGML_TYPE_F16) {
1032
+ return true;
1033
+ } else if (op->src[0]->type == GGML_TYPE_F32) {
1034
+ return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
1035
+ } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
1036
+ op->src[0]->type == GGML_TYPE_Q6_K) {
1037
+ return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
1038
+ }
1039
+ return false;
1040
+ case GGML_OP_RESHAPE:
1041
+ case GGML_OP_VIEW:
1042
+ case GGML_OP_PERMUTE:
1043
+ case GGML_OP_TRANSPOSE:
1044
+ return true;
1045
+ case GGML_OP_DIAG_MASK_INF:
1046
+ return op->ne[3] == 1;
1047
+ case GGML_OP_ROPE:
1048
+ return true;
1049
+ default:
1050
+ return false;
1051
+ }
1052
+ }
1053
+
1054
+ // Forward declaration - implementation appears later in the file.
1055
+ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type);
1056
+
1057
+ static ggml_guid_t ggml_backend_opencl_guid() {
1058
+ static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe };
1059
+ return &guid;
1060
+ }
1061
+
1062
+ static ggml_backend_i ggml_backend_opencl_i = {
1063
+ /* .get_name = */ ggml_backend_opencl_name,
1064
+ /* .free = */ ggml_backend_opencl_free,
1065
+ /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
1066
+ /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
1067
+ /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
1068
+ /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */
1069
+ /* .graph_plan_create = */ NULL,
1070
+ /* .graph_plan_free = */ NULL,
1071
+ /* .graph_plan_update = */ NULL,
1072
+ /* .graph_plan_compute = */ NULL,
1073
+ /* .graph_compute = */ ggml_backend_opencl_graph_compute,
1074
+ /* .event_record = */ NULL,
1075
+ /* .event_wait = */ NULL,
1076
+ };
1077
+
1078
+ ggml_backend_t ggml_backend_opencl_init(void) {
1079
+ ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
1080
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
1081
+
1082
+ ggml_backend_t backend = new ggml_backend {
1083
+ /* .guid = */ ggml_backend_opencl_guid(),
1084
+ /* .interface = */ ggml_backend_opencl_i,
1085
+ /* .device = */ dev,
1086
+ /* .context = */ backend_ctx
1087
+ };
1088
+
1089
+ return backend;
1090
+ }
1091
+
1092
+ bool ggml_backend_is_opencl(ggml_backend_t backend) {
1093
+ return backend && backend->iface.get_name == ggml_backend_opencl_name;
1094
+ }
1095
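A minimal usage sketch for the two entry points above, assuming the usual ggml-backend headers, a header declaring ggml_backend_opencl_init, and an already-built graph `gf`; this is illustrative, not code from the package:

    #include "ggml-backend.h"

    void run_on_opencl(ggml_cgraph * gf) {
        ggml_backend_t backend = ggml_backend_opencl_init();
        if (backend == nullptr || !ggml_backend_is_opencl(backend)) {
            return; // OpenCL backend not available
        }
        // ... allocate the graph's tensors in an OpenCL buffer, then:
        ggml_backend_graph_compute(backend, gf);
        ggml_backend_free(backend);
    }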
+
1096
+ //
1097
+ // buffer
1098
+ //
1099
+ struct ggml_backend_opencl_buffer_context {
1100
+ // A buffer context can hold multiple cl_mem objects. This is for flattening
1101
+ // quantized weights and should be used with GGML_OPENCL_SMALL_ALLOC where
1102
+ // each tensor is allocated a separate buffer. When flattening is enabled
1103
+ // with small allocation, each tensor is backed by two cl_mem objects (for
1104
+ // quants and scales) packed into a backend_opencl_buffer.
1105
+ ggml_backend_opencl_buffer_context(cl_mem buf)
1106
+ : name("OpenCL") {
1107
+ buffer.push_back(buf);
1108
+ }
1109
+
1110
+ ~ggml_backend_opencl_buffer_context() {
1111
+ for (cl_mem buf : buffer) {
1112
+ CL_CHECK(clReleaseMemObject(buf));
1113
+ }
1114
+ for (cl_mem im : img) {
1115
+ CL_CHECK(clReleaseMemObject(im));
1116
+ }
1117
+
1118
+ // Delete all extras to trigger their destructors
1119
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras) {
1120
+ delete e;
1121
+ }
1122
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
1123
+ delete e;
1124
+ }
1125
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) {
1126
+ delete e;
1127
+ }
1128
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
1129
+ delete e;
1130
+ }
1131
+ }
1132
+
1133
+ ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
1134
+ ggml_tensor_extra_cl * extra;
1135
+ if (temp_tensor_extras.empty()) {
1136
+ extra = new ggml_tensor_extra_cl();
1137
+ } else {
1138
+ extra = temp_tensor_extras.back();
1139
+ temp_tensor_extras.pop_back();
1140
+ }
1141
+
1142
+ temp_tensor_extras_in_use.push_back(extra);
1143
+
1144
+ extra->reset();
1145
+ return extra;
1146
+ }
1147
+
1148
+ ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
1149
+ ggml_tensor_extra_cl_q4_0 * extra;
1150
+ if (temp_tensor_extras_q4_0.empty()) {
1151
+ extra = new ggml_tensor_extra_cl_q4_0();
1152
+ } else {
1153
+ extra = temp_tensor_extras_q4_0.back();
1154
+ temp_tensor_extras_q4_0.pop_back();
1155
+ }
1156
+
1157
+ temp_tensor_extras_q4_0_in_use.push_back(extra);
1158
+
1159
+ extra->reset();
1160
+ return extra;
1161
+ }
1162
+
1163
+ void reset() {
1164
+ for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
1165
+ temp_tensor_extras.push_back(e);
1166
+ }
1167
+ temp_tensor_extras_in_use.clear();
1168
+
1169
+ for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
1170
+ temp_tensor_extras_q4_0.push_back(e);
1171
+ }
1172
+ temp_tensor_extras_q4_0_in_use.clear();
1173
+ }
1174
+
1175
+ // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
1176
+ // being used are in `temp_tensor_extras_in_use`. At the first run, new
1177
+ // extras get created and put in `in_use`. When the buffer is reset via
1178
+ // the `reset` callback, all extras in `in_use` get moved to available extras
1179
+ // for reuse.
1180
+ std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
1181
+ std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
1182
+ std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
1183
+ std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
1184
+
1185
+ // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
1186
+ // before any tensor is initialized (at the beginning of alloc_tensor_range).
1187
+ // Hence, there is always a buffer object in this vector. When each tensor is
1188
+ // being initialized, this original buffer object will be released if both
1189
+ // flattening and small allocation are enabled, and additional buffer
1190
+ // objects will be created in init_tensor to represent flattened quantized
1191
+ // weights.
1192
+ std::vector<cl_mem> buffer;
1193
+ // These are image1d_buffer_t objects that wrap around the quants and scales.
1194
+ // For Q4_0 quantization, there should be two of them - one for quants and
1195
+ // one for scales. They should be populated only when flattening and small
1196
+ // allocation are enabled.
1197
+ std::vector<cl_mem> img;
1198
+ std::string name;
1199
+ };
1200
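The extras pooling described in the comments above follows a simple two-list pattern: extras move from "available" to "in use" when handed out, and the buffer's `reset` callback recycles everything back. A self-contained sketch of the same pattern with a placeholder element type (names are illustrative only; cleanup on destruction is omitted for brevity):

    #include <vector>

    struct extra_pool {
        std::vector<int *> available;   // extras ready for reuse
        std::vector<int *> in_use;      // extras handed out since the last recycle

        int * alloc() {
            int * e;
            if (available.empty()) {
                e = new int();          // first run: create a new extra
            } else {
                e = available.back();   // later runs: reuse a recycled one
                available.pop_back();
            }
            in_use.push_back(e);
            *e = 0;                     // equivalent of calling reset() on the extra
            return e;
        }

        void recycle_all() {            // mirrors the buffer `reset` callback
            available.insert(available.end(), in_use.begin(), in_use.end());
            in_use.clear();
        }
    };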
+
1201
+ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
1202
+
1203
+ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1204
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1205
+ delete ctx;
1206
+ }
1207
+
1208
+ static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
1209
+ return cl_ptr_base;
1210
+
1211
+ GGML_UNUSED(buffer);
1212
+ }
1213
+
1214
+ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1215
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1216
+
1217
+ ggml_cl2_init(buffer->buft->device);
1218
+
1219
+ if (tensor->view_src != nullptr) {
1220
+ GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1221
+
1222
+ ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
1223
+ GGML_ASSERT(view_extra && "view_extra is nullptr?");
1224
+
1225
+ // Reuse extra of the parent tensor. The offset of this view tensor
1226
+ // becomes `extra->offset + view_offs` and needs to be calculated when
1227
+ // it is used. This change is needed because of the change to
1228
+ // ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
1229
+ // `buffer` passed in here will always be `tensor->buffer`. It is OK
1230
+ // to allocate extras from the same buffer context for ordinary
1231
+ // intermediate tensors. But for views into kv cache tensors, doing so
1232
+ // would mess up the extras used by kv cache.
1233
+ // Before #7640, `buffer` is for intermediate tensors, which is always
1234
+ // different from that of kv cache tensors.
1235
+ //
1236
+ // NB: now extra->offset no longer accounts for view_offs.
1237
+ // NB: this should not apply to weight tensors (for end-to-end runs, but
1238
+ // may apply for test-backend-ops).
1239
+ // FIXME: if any unexpected results are seen, double check the offset -
1240
+ // there could be other places that need fix.
1241
+ tensor->extra = view_extra;
1242
+ } else {
1243
+ {
1244
+ size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
1245
+
1246
+ ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
1247
+ extra->offset = offset;
1248
+ extra->data_device = ctx->buffer[0];
1249
+ extra->actual_size = ggml_nbytes(tensor);
1250
+
1251
+ tensor->extra = extra;
1252
+ }
1253
+ }
1254
+ }
1255
+
1256
+ // The optimized gemm and gemv kernels are used for large matrices without batch.
1257
+ // tensor is the quantized weights matrix.
1258
+ inline bool use_adreno_kernels(const ggml_tensor *tensor) {
1259
+ return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 &&
1260
+ tensor->ne[2] == 1 && tensor->ne[3] == 1;
1261
+ }
1262
+
1263
+ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1264
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
1265
+
1266
+ cl_context context = backend_ctx->context;
1267
+ cl_command_queue queue = backend_ctx->queue;
1268
+
1269
+ #ifdef GGML_OPENCL_SOA_Q
1270
+ // We separate the quantized bits and scale from block_q4_0 by using an
1271
+ // additional kernel, where each thread handles a block. We first read the
1272
+ // original weights into a temporary buffer, then create two separate
1273
+ // buffers for quantized bits and scales, which are then populated by the
1274
+ // conversion kernel.
1275
+ if (tensor->type == GGML_TYPE_Q4_0) {
1276
+ // Tensors should have been preallocated, therefore they should
1277
+ // already have ggml_tensor_extra_cl as extra.
1278
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
1279
+ GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
1280
+
1281
+ // Allocate the new extra and create aliases from the original.
1282
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1283
+ ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();
1284
+
1285
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
1286
+ size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
1287
+ GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
1288
+
1289
+ cl_int err;
1290
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
1291
+ ggml_nbytes(tensor), NULL, &err);
1292
+ CL_CHECK(err);
1293
+ CL_CHECK(clEnqueueWriteBuffer(
1294
+ queue, data_device, CL_TRUE, 0,
1295
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
1296
+
1297
+ // We always consider the specified offset arg, although for weights
1298
+ // the offset arg should be 0 (we do not assert this).
1299
+ //GGML_ASSERT(offset == 0);
1300
+
1301
+ // We create subbuffers from the original tensor buffer for scales and
1302
+ // quants - i.e., scales and quants are aliases into the buffer object
1303
+ // that backs the original tensor. This is a cleaner way to adapt to the
1304
+ // new memory management.
1305
+ // In the old code, we allocate new buffers for scales and quants
1306
+ // respectively, which could still be done but would result in double
1307
+ // allocation; properly deallocating the preallocated buffer that backs
1308
+ // the tensors is tricky and would leak the backend specific information
1309
+ // into the general backend code.
1310
+ // Does this create misaligned subbuffers (alignment is 1024) in certain
1311
+ // cases?
1312
+ cl_buffer_region region;
1313
+
1314
+ // The original tensor memory is divided into scales and quants, i.e.,
1315
+ // we first store scales, then quants.
1316
+ // Create subbuffer for scales.
1317
+ region.origin = extra_orig->offset + tensor->view_offs + offset;
1318
+ region.size = size_d;
1319
+ extra->d = clCreateSubBuffer(
1320
+ extra_orig->data_device, CL_MEM_READ_WRITE,
1321
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
1322
+ CL_CHECK(err);
1323
+
1324
+ // Create subbuffer for quants.
1325
+ region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
1326
+ region.size = size_q;
1327
+ extra->q = clCreateSubBuffer(
1328
+ extra_orig->data_device, CL_MEM_READ_WRITE,
1329
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
1330
+ CL_CHECK(err);
1331
+
1332
+ //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
1333
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1334
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
1335
+
1336
+ // The optimized kernels need weights in natural order, so unshuffle.
1337
+ if (use_adreno_kernels(tensor)) {
1338
+ kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
1339
+ }
1340
+ #else
1341
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
1342
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1343
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
1344
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
1345
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
1346
+
1347
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
1348
+ size_t local_work_size[] = {64, 1, 1};
1349
+
1350
+ cl_event evt;
1351
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
1352
+ CL_CHECK(clWaitForEvents(1, &evt));
1353
+ CL_CHECK(clReleaseMemObject(data_device));
1354
+
1355
+ tensor->extra = extra;
1356
+
1357
+ // transpose the weights and scales
1358
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1359
+ // Only do the transpose for large, non-batched matrices
1360
+ // TODO: use preallocated images instead of sub-buffer then image
1361
+ if (use_adreno_kernels(tensor)) {
1362
+ // <----------------------------------------------------------------------------------> //
1363
+ // start transpose
1364
+ // <----------------------------------------------------------------------------------> //
1365
+ int M = tensor->ne[1]; // ne01
1366
+ int K = tensor->ne[0]; // ne00
1367
+
1368
+ // transpose is out of place, so we need to allocate transposed buffers
1369
+ // <----------------------------------------------------------------------------------> //
1370
+ // use sub_buffer of max buffer size instead
1371
+
1372
+ size_t q_size_bytes = K * M / 8 * sizeof(float);
1373
+ cl_buffer_region region;
1374
+ region.origin = 0;
1375
+ region.size = q_size_bytes;
1376
+ cl_mem qT_d = clCreateSubBuffer(
1377
+ backend_ctx->A_q_d_max,
1378
+ 0,
1379
+ CL_BUFFER_CREATE_TYPE_REGION,
1380
+ &region,
1381
+ &err);
1382
+ // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
1383
+ CL_CHECK(err);
1384
+
1385
+ // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
1386
+ size_t d_size_bytes = M * (K / 32) * 2;
1387
+ region.origin = 0;
1388
+ region.size = d_size_bytes;
1389
+ cl_mem dT_d = clCreateSubBuffer(
1390
+ backend_ctx->A_s_d_max,
1391
+ 0,
1392
+ CL_BUFFER_CREATE_TYPE_REGION,
1393
+ &region,
1394
+ &err);
1395
+ // cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
1396
+ CL_CHECK(err);
1397
+
1398
+ // <----------------------------------------------------------------------------------> //
1399
+
1400
+
1401
+ // create images from the buffers
1402
+ // <----------------------------------------------------------------------------------> //
1403
+ cl_mem q_d_image1D;
1404
+ cl_mem d_d_image1D;
1405
+ cl_mem qT_d_image1D;
1406
+ cl_mem dT_d_image1D;
1407
+
1408
+ cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
1409
+ cl_image_desc img_desc_1d;
1410
+
1411
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
1412
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1413
+ img_desc_1d.image_width = M * K / 8 / 4;
1414
+ img_desc_1d.buffer = extra->q;
1415
+ q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
1416
+ CL_CHECK(err);
1417
+
1418
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
1419
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
1420
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1421
+ img_desc_1d.image_width = M * K / 8 / 4;
1422
+ img_desc_1d.buffer = qT_d;
1423
+ qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
1424
+ CL_CHECK(err);
1425
+
1426
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
1427
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
1428
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1429
+ img_desc_1d.image_width = M * K / 32 / 4 / 2;
1430
+ img_desc_1d.buffer = extra->d;
1431
+ d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
1432
+ CL_CHECK(err);
1433
+
1434
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
1435
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
1436
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
1437
+ img_desc_1d.image_width = M * K / 32 / 4 / 2;
1438
+ img_desc_1d.buffer = dT_d;
1439
+ dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
1440
+ CL_CHECK(err);
1441
+ // <----------------------------------------------------------------------------------> //
1442
+
1443
+ // set up and call the transpose kernels
1444
+ // <----------------------------------------------------------------------------------> //
1445
+ // weights
1446
+ int height_q = M / 8;
1447
+ int width_q = K / 8 / 4;
1448
+ kernel = backend_ctx->kernel_transpose_16;
1449
+
1450
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
1451
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
1452
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
1453
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
1454
+
1455
+ size_t local_size_q[3] = {4, 16, 1};
1456
+ size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
1457
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
1458
+ CL_CHECK(clWaitForEvents(1, &evt));
1459
+
1460
+ // scales
1461
+ int height_s = M / 8;
1462
+ int width_s = K / 32 / 8;
1463
+
1464
+ kernel = backend_ctx->kernel_transpose_16;
1465
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
1466
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
1467
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
1468
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
1469
+
1470
+ size_t local_size_s[3] = {4, 16, 1};
1471
+ size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
1472
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
1473
+ CL_CHECK(clWaitForEvents(1, &evt));
1474
+ // <----------------------------------------------------------------------------------> //
1475
+
1476
+ // copy transposed buffer contents to original buffers
1477
+ // <----------------------------------------------------------------------------------> //
1478
+ // weights
1479
+ CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
1480
+ CL_CHECK(clWaitForEvents(1, &evt));
1481
+
1482
+ // scales
1483
+ CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
1484
+ CL_CHECK(clWaitForEvents(1, &evt));
1485
+ // <----------------------------------------------------------------------------------> //
1486
+
1487
+ // deallocate transpose buffers
1488
+ // <----------------------------------------------------------------------------------> //
1489
+ CL_CHECK(clReleaseMemObject(qT_d));
1490
+ CL_CHECK(clReleaseMemObject(dT_d));
1491
+
1492
+ // deallocate temporary images
1493
+ CL_CHECK(clReleaseMemObject(q_d_image1D));
1494
+ CL_CHECK(clReleaseMemObject(d_d_image1D));
1495
+ CL_CHECK(clReleaseMemObject(qT_d_image1D));
1496
+ CL_CHECK(clReleaseMemObject(dT_d_image1D));
1497
+ // <----------------------------------------------------------------------------------> //
1498
+ // end transpose
1499
+ // <----------------------------------------------------------------------------------> //
1500
+ }
1501
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1502
+
1503
+ return;
1504
+ }
1505
+ #endif // GGML_OPENCL_SOA_Q
1506
+
1507
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
1508
+ GGML_ASSERT(extra);
1509
+
1510
+ CL_CHECK(clEnqueueWriteBuffer(
1511
+ queue, extra->data_device, CL_TRUE, extra->offset + offset,
1512
+ size, data, 0, NULL, NULL));
1513
+
1514
+ GGML_UNUSED(buffer);
1515
+ }
1516
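As a sanity check on the size_d/size_q arithmetic used in the Q4_0 path above (assuming the standard block size QK4_0 = 32 with one fp16 scale per block), here is a tiny standalone program; the 4096x4096 shape is only an example:

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t nelements = 4096u * 4096u;            // hypothetical Q4_0 weight matrix
        const size_t blck_size = 32;                       // QK4_0
        const size_t nblocks   = nelements / blck_size;
        const size_t size_d    = nblocks * 2;              // fp16 scale: 2 bytes per block
        const size_t size_q    = nblocks * blck_size / 2;  // 4-bit quants: 16 bytes per block
        assert(size_d + size_q == nblocks * 18);           // 18 bytes per block_q4_0, i.e. ggml_nbytes()
        return 0;
    }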
+
1517
+ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1518
+ GGML_ASSERT(tensor->extra);
1519
+
1520
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
1521
+
1522
+ cl_context context = backend_ctx->context;
1523
+ cl_command_queue queue = backend_ctx->queue;
1524
+
1525
+ // Make sure all previously submitted commands are finished.
1526
+ CL_CHECK(clFinish(queue));
1527
+
1528
+ #ifdef GGML_OPENCL_SOA_Q
1529
+ // In end-to-end runs, get_tensor is usually used to get back the logits,
1530
+ // where we can simply do clEnqueueReadBuffer since they are f32.
1531
+ // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
1532
+ // which requires reading back quantized weight tensors.
1533
+ // To properly support this, we need to restore block_q4_0 struct arrays
1534
+ // from the flattened buffers.
1535
+ if (tensor->type == GGML_TYPE_Q4_0) {
1536
+ ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
1537
+
1538
+ cl_int err;
1539
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
1540
+ ggml_nbytes(tensor), NULL, &err);
1541
+ CL_CHECK(err);
1542
+
1543
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
1544
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
1545
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
1546
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
1547
+
1548
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
1549
+ size_t local_work_size[] = {1, 1, 1};
1550
+
1551
+ cl_event evt;
1552
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
1553
+ global_work_size, local_work_size, 0, NULL, &evt));
1554
+ CL_CHECK(clWaitForEvents(1, &evt));
1555
+ CL_CHECK(clEnqueueReadBuffer(
1556
+ queue, data_device, CL_TRUE, offset,
1557
+ size, data, 0, NULL, NULL));
1558
+ CL_CHECK(clReleaseMemObject(data_device));
1559
+ return;
1560
+ }
1561
+ #endif // GGML_OPENCL_SOA_Q
1562
+
1563
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
1564
+
1565
+ CL_CHECK(clEnqueueReadBuffer(
1566
+ queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
1567
+ size, data, 0, NULL, NULL));
1568
+
1569
+ GGML_UNUSED(buffer);
1570
+ }
1571
+
1572
+ static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1573
+ ggml_backend_dev_t dev = buffer->buft->device;
1574
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
1575
+ cl_command_queue queue = backend_ctx->queue;
1576
+
1577
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1578
+ for (cl_mem buf : ctx->buffer) {
1579
+ CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
1580
+ }
1581
+ CL_CHECK(clFinish(queue));
1582
+ }
1583
+
1584
+ static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
1585
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1586
+ ctx->reset();
1587
+ }
1588
+
1589
+ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
1590
+ /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
1591
+ /* .get_base = */ ggml_backend_opencl_buffer_get_base,
1592
+ /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
1593
+ /* .memset_tensor = */ NULL,
1594
+ /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
1595
+ /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
1596
+ /* .cpy_tensor = */ NULL,
1597
+ /* .clear = */ ggml_backend_opencl_buffer_clear,
1598
+ /* .reset = */ ggml_backend_opencl_buffer_reset,
1599
+ };
1600
+
1601
+ //
1602
+ // buffer type
1603
+ //
1604
+
1605
+ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
1606
+ return "OpenCL";
1607
+
1608
+ GGML_UNUSED(buffer_type);
1609
+ }
1610
+
1611
+ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
1612
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
1613
+
1614
+ // clCreateBuffer returns -61 (CL_INVALID_BUFFER_SIZE) for size 0
1615
+ size = std::max(size, (size_t)1);
1616
+
1617
+ cl_int err;
1618
+ cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
1619
+ if (err != CL_SUCCESS) {
1620
+ GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
1621
+ return nullptr;
1622
+ }
1623
+
1624
+ ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem);
1625
+
1626
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
1627
+ }
1628
+
1629
+ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
1630
+ // FIXME: not thread safe, device may not be initialized yet
1631
+ static cl_uint alignment = -1;
1632
+ if (alignment == (cl_uint)-1) {
1633
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
1634
+ alignment = backend_ctx->alignment;
1635
+ }
1636
+ return alignment;
1637
+ }
1638
+
1639
+ static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
1640
+ static size_t max_size = -1;
1641
+ if (max_size == (size_t)-1) {
1642
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
1643
+ max_size = backend_ctx->max_alloc_size;
1644
+ }
1645
+ return max_size;
1646
+ }
1647
+
1648
+ static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
1649
+ return ggml_backend_is_opencl(backend);
1650
+
1651
+ UNUSED(buft);
1652
+ }
1653
+
1654
+ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
1655
+ /* .get_name = */ ggml_backend_opencl_buffer_type_get_name,
1656
+ /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
1657
+ /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
1658
+ /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
1659
+ /* .get_alloc_size = */ NULL,
1660
+ /* .is_host = */ NULL,
1661
+ };
1662
+
1663
+ ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
1664
+ static ggml_backend_buffer_type buffer_type = {
1665
+ /* .iface = */ ggml_backend_opencl_buffer_type_interface,
1666
+ /* .device = */ &g_ggml_backend_opencl_device,
1667
+ /* .context = */ nullptr,
1668
+ };
1669
+
1670
+ return &buffer_type;
1671
+ }
1672
+
1673
+ //
1674
+ // backend device
1675
+ //
1676
+
1677
+ static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
1678
+ return "GPUOpenCL";
1679
+
1680
+ GGML_UNUSED(dev);
1681
+ }
1682
+
1683
+ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
1684
+ ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
1685
+ return dev_ctx->device_name.c_str();
1686
+ }
1687
+
1688
+ static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1689
+ *free = 1;
1690
+ *total = 1;
1691
+
1692
+ GGML_UNUSED(dev);
1693
+ }
1694
+
1695
+ static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
1696
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
1697
+
1698
+ GGML_UNUSED(dev);
1699
+ }
1700
+
1701
+ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
1702
+ props->name = ggml_backend_opencl_device_get_name(dev);
1703
+ props->description = ggml_backend_opencl_device_get_description(dev);
1704
+ props->type = ggml_backend_opencl_device_get_type(dev);
1705
+ ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
1706
+ props->caps = ggml_backend_dev_caps {
1707
+ /* .async = */ false,
1708
+ /* .host_buffer = */ false,
1709
+ /* .buffer_from_host_ptr = */ false,
1710
+ /* .events = */ false,
1711
+ };
1712
+ }
1713
+
1714
+ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
1715
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
1716
+
1717
+ ggml_backend_t backend = new ggml_backend {
1718
+ /* .guid = */ ggml_backend_opencl_guid(),
1719
+ /* .interface = */ ggml_backend_opencl_i,
1720
+ /* .device = */ dev,
1721
+ /* .context = */ backend_ctx,
1722
+ };
1723
+
1724
+ return backend;
1725
+
1726
+ GGML_UNUSED(params);
1727
+ }
1728
+
1729
+ static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
1730
+ return ggml_backend_opencl_buffer_type();
1731
+
1732
+ GGML_UNUSED(dev);
1733
+ }
1734
+
1735
+ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1736
+ GGML_UNUSED(dev);
1737
+ GGML_UNUSED(ptr);
1738
+ GGML_UNUSED(size);
1739
+ GGML_UNUSED(max_tensor_size);
1740
+ return nullptr;
1741
+ }
1742
+
1743
+ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1744
+ return ggml_opencl_supports_op(dev, op);
1745
+ }
1746
+
1747
+ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1748
+ return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
1749
+
1750
+ GGML_UNUSED(dev);
1751
+ }
1752
+
1753
+ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
1754
+ /* .get_name = */ ggml_backend_opencl_device_get_name,
1755
+ /* .get_description = */ ggml_backend_opencl_device_get_description,
1756
+ /* .get_memory = */ ggml_backend_opencl_device_get_memory,
1757
+ /* .get_type = */ ggml_backend_opencl_device_get_type,
1758
+ /* .get_props = */ ggml_backend_opencl_device_get_props,
1759
+ /* .init_backend = */ ggml_backend_opencl_device_init,
1760
+ /* .get_buffer_type = */ ggml_backend_opencl_device_get_buffer_type,
1761
+ /* .get_host_buffer_type = */ NULL,
1762
+ /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
1763
+ /* .supports_op = */ ggml_backend_opencl_device_supports_op,
1764
+ /* .supports_buft = */ ggml_backend_opencl_device_supports_buft,
1765
+ /* .offload_op = */ NULL,
1766
+ /* .event_new = */ NULL,
1767
+ /* .event_free = */ NULL,
1768
+ /* .event_synchronize = */ NULL,
1769
+ };
1770
+
1771
+ // Backend registry
1772
+
1773
+ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
1774
+ return "OpenCL";
1775
+
1776
+ GGML_UNUSED(reg);
1777
+ }
1778
+
1779
+ static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
1780
+ return ggml_backend_opencl_n_devices;
1781
+
1782
+ GGML_UNUSED(reg);
1783
+ }
1784
+
1785
+ static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
1786
+ GGML_ASSERT(index == 0);
1787
+
1788
+ return &g_ggml_backend_opencl_device;
1789
+
1790
+ GGML_UNUSED(reg);
1791
+ GGML_UNUSED(index);
1792
+ }
1793
+
1794
+ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
1795
+ /* .get_name = */ ggml_backend_opencl_reg_get_name,
1796
+ /* .device_count = */ ggml_backend_opencl_reg_device_count,
1797
+ /* .device_get = */ ggml_backend_opencl_reg_device_get,
1798
+ /* .get_proc_address = */ NULL,
1799
+ };
1800
+
1801
+ ggml_backend_reg_t ggml_backend_opencl_reg(void) {
1802
+ // TODO: make this thread-safe somehow?
1803
+ static ggml_backend_reg reg;
1804
+ static bool initialized = false;
1805
+
1806
+ if (!initialized) {
1807
+ reg = ggml_backend_reg {
1808
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
1809
+ /* .iface = */ ggml_backend_opencl_reg_i,
1810
+ /* .context = */ NULL,
1811
+ };
1812
+
1813
+ g_ggml_backend_opencl_device = ggml_backend_device {
1814
+ /* .iface = */ ggml_backend_opencl_device_i,
1815
+ /* .reg = */ &reg,
1816
+ /* .context = */ &g_ggml_ctx_dev_main,
1817
+ };
1818
+
1819
+ ggml_cl2_init(&g_ggml_backend_opencl_device);
1820
+
1821
+ initialized = true;
1822
+ }
1823
+
1824
+ return &reg;
1825
+ }
1826
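A brief sketch of how a caller might enumerate devices through this registry via the public ggml-backend API (illustrative only; the count comes from ggml_backend_opencl_n_devices above, which is currently 1):

    #include <cstdio>
    #include "ggml-backend.h"

    void list_opencl_devices() {
        ggml_backend_reg_t reg = ggml_backend_opencl_reg();
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
            printf("%s: %s\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }
    }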
+
1827
+ GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
1828
+
1829
+ //------------------------------------------------------------------------------
1830
+ // Debugging utils
1831
+ //------------------------------------------------------------------------------
1832
+ #if 0
1833
+ #define QK4_0 32
1834
+ typedef struct {
1835
+ ggml_fp16_t d; // delta
1836
+ uint8_t qs[QK4_0 / 2]; // nibbles / quants
1837
+ } block_q4_0;
1838
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
1839
+ "wrong q4_0 block size/padding");
1840
+
1841
+ #include <math.h>
1842
+ #ifdef __cplusplus
1843
+ #include "half.hpp"
1844
+ #endif
1845
+
1846
+ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
1847
+ void * buf = malloc(ggml_nbytes(tensor));
1848
+
1849
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
1850
+ cl_command_queue queue = backend_ctx->queue;
1851
+ #ifdef GGML_OPENCL_SOA_Q
1852
+ void * buf_q;
1853
+ void * buf_d;
1854
+ #endif
1855
+
1856
+ #ifdef GGML_USE_OPENCL
1857
+ // Make sure everything is done.
1858
+ CL_CHECK(clFinish(queue));
1859
+
1860
+ #ifdef GGML_OPENCL_SOA_Q
1861
+ if (tensor->type == GGML_TYPE_Q4_0) {
1862
+ ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
1863
+ GGML_ASSERT(extra);
1864
+
1865
+ size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2;
1866
+ size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t);
1867
+ GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor));
1868
+ buf_q = malloc(size_q);
1869
+ buf_d = malloc(size_d);
1870
+
1871
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
1872
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
1873
+ CL_CHECK(clFinish(queue));
1874
+ } else {
1875
+ // Read out the tensor from GPU memory.
1876
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
1877
+ GGML_ASSERT(extra);
1878
+
1879
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
1880
+ extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
1881
+ CL_CHECK(clFinish(queue));
1882
+ }
1883
+ #else
1884
+ // Read out the tensor from GPU memory.
1885
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
1886
+ GGML_ASSERT(extra);
1887
+
1888
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
1889
+ extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
1890
+ CL_CHECK(clFinish(queue));
1891
+ #endif // GGML_OPENCL_SOA_Q
1892
+ #endif // GGML_USE_OPENCL
1893
+
1894
+ // Open file and dump.
1895
+ char fname[512];
1896
+ sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
1897
+ FILE * f = fopen(fname, "w");
1898
+ if (!f) {
1899
+ printf("Failed to open %s\n", fname);
1900
+ return;
1901
+ }
1902
+
1903
+ if (tensor->type == GGML_TYPE_F32) {
1904
+ float * data = (float *) buf;
1905
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
1906
+ if (isnan(data[i])) {
1907
+ printf("NaN found: %s\n", tensor->name);
1908
+ break;
1909
+ }
1910
+ fprintf(f, "%f\n", data[i]);
1911
+ }
1912
+ } else if (tensor->type == GGML_TYPE_I32) {
1913
+ int * data = (int *) buf;
1914
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
1915
+ if (isnan(data[i])) {
1916
+ printf("NaN found: %s\n", tensor->name);
1917
+ break;
1918
+ }
1919
+ fprintf(f, "%d\n", data[i]);
1920
+ }
1921
+ } else if (tensor->type == GGML_TYPE_F16) {
1922
+ #ifdef __cplusplus
1923
+ half_float::half * data = (half_float::half *) buf;
1924
+ for (int i = 0; i < ggml_nelements(tensor); ++i) {
1925
+ if (std::isnan(data[i])) {
1926
+ printf("NaN found: %s\n", tensor->name);
1927
+ break;
1928
+ }
1929
+ fprintf(f, "%f\n", float(data[i]));
1930
+ }
1931
+ #endif
1932
+ } else if (tensor->type == GGML_TYPE_Q4_0) {
1933
+ #ifdef GGML_OPENCL_SOA_Q
1934
+ ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
1935
+ unsigned char * data_q = (unsigned char *)buf_q;
1936
+
1937
+ for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
1938
+ fprintf(f, "%04x, ", data_d[i]);
1939
+ for (int k = 0; k < QK4_0/2; ++k) {
1940
+ fprintf(f, "%02x, ", data_q[k]);
1941
+ }
1942
+ fprintf(f, "\n");
1943
+ data_q += QK4_0/2;
1944
+ }
1945
+ free(buf_d);
1946
+ free(buf_q);
1947
+ #else
1948
+ block_q4_0 * data = (block_q4_0 *) buf;
1949
+ for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
1950
+ fprintf(f, "%04x, ", data[i].d);
1951
+ for (int k = 0; k < QK4_0/2; ++k) {
1952
+ fprintf(f, "%02x, ", data[i].qs[k]);
1953
+ }
1954
+ fprintf(f, "\n");
1955
+ }
1956
+ #endif // GGML_OPENCL_SOA_Q
1957
+ }
1958
+ free(buf);
1959
+ fflush(f);
1960
+ fclose(f);
1961
+ }
1962
+ #else
1963
+ #define dump_tensor(tensor)
1964
+ #endif
1965
+
1966
+ //------------------------------------------------------------------------------
1967
+ // Profiling utility
1968
+ //------------------------------------------------------------------------------
1969
+ #ifdef GGML_OPENCL_PROFILING
1970
+ void populateProfilingInfo(
1971
+ ProfilingInfo& info, cl_event evt, cl_kernel kernel,
1972
+ size_t global_size[3], size_t local_size[3],
1973
+ const ggml_tensor * tensor) {
1974
+ cl_ulong start;
1975
+ cl_ulong end;
1976
+ CL_CHECK(clWaitForEvents(1, &evt));
1977
+ CL_CHECK(clGetEventProfilingInfo(
1978
+ evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
1979
+ CL_CHECK(clGetEventProfilingInfo(
1980
+ evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
1981
+
1982
+ char kernel_name[512];
1983
+ CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
1984
+ sizeof(kernel_name), kernel_name, NULL));
1985
+
1986
+ info.duration_ns = end - start;
1987
+ info.op_name = tensor->name;
1988
+ info.kernel_name = kernel_name;
1989
+ info.local_size[0] = local_size[0];
1990
+ info.local_size[1] = local_size[1];
1991
+ info.local_size[2] = local_size[2];
1992
+ info.global_size[0] = global_size[0];
1993
+ info.global_size[1] = global_size[1];
1994
+ info.global_size[2] = global_size[2];
1995
+ info.output_size[0] = tensor->ne[0];
1996
+ info.output_size[1] = tensor->ne[1];
1997
+ info.output_size[2] = tensor->ne[2];
1998
+ info.output_size[3] = tensor->ne[3];
1999
+ }
2000
+ #endif
2001
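Note that clGetEventProfilingInfo only yields valid timestamps when the command queue was created with profiling enabled; a minimal sketch of such a queue using the standard OpenCL 2.0 API (assumes an existing `context`, `device`, and `err`; not code from this file):

    cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
    cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, props, &err);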
+
2002
+ //------------------------------------------------------------------------------
2003
+ // Ops
2004
+ //------------------------------------------------------------------------------
2005
+
2006
+ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
2007
+ const int64_t ne10 = src1->ne[0];
2008
+
2009
+ const int64_t ne0 = dst->ne[0];
2010
+ const int64_t ne1 = dst->ne[1];
2011
+
2012
+ // TODO: find the optimal values for these
2013
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2014
+ src1->type == GGML_TYPE_F32 &&
2015
+ dst->type == GGML_TYPE_F32 &&
2016
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
2017
+ }
2018
+
2019
+ static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2020
+ UNUSED(backend);
2021
+ UNUSED(src0);
2022
+ UNUSED(src1);
2023
+ UNUSED(dst);
2024
+ }
2025
+
2026
+ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2027
+ GGML_ASSERT(src0);
2028
+ GGML_ASSERT(src0->extra);
2029
+ GGML_ASSERT(src1);
2030
+ GGML_ASSERT(src1->extra);
2031
+ GGML_ASSERT(dst);
2032
+ GGML_ASSERT(dst->extra);
2033
+
2034
+ const int ne00 = src0 ? src0->ne[0] : 0;
2035
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2036
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
2037
+ const int ne10 = src1 ? src1->ne[0] : 0;
2038
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
2039
+ const int ne11 = src1 ? src1->ne[1] : 0;
2040
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
2041
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
2042
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
2043
+
2044
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2045
+ cl_command_queue queue = backend_ctx->queue;
2046
+
2047
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2048
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
2049
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2050
+
2051
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2052
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
2053
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2054
+
2055
+ cl_kernel kernel;
2056
+
2057
+ switch (src0->type) {
2058
+ case GGML_TYPE_F32:
2059
+ kernel = backend_ctx->kernel_get_rows_f32;
2060
+ break;
2061
+ case GGML_TYPE_F16:
2062
+ kernel = backend_ctx->kernel_get_rows_f16;
2063
+ break;
2064
+ case GGML_TYPE_Q4_0:
2065
+ kernel = backend_ctx->kernel_get_rows_q4_0;
2066
+ break;
2067
+ default:
2068
+ GGML_ASSERT(false && "not implemented");
2069
+ }
2070
+
2071
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2072
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2073
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2074
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2075
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2076
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2077
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
2078
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
2079
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
2080
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
2081
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
2082
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
2083
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
2084
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
2085
+
2086
+ size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
2087
+ size_t local_work_size[] = {1, 1, 1};
2088
+
2089
+ #ifdef GGML_OPENCL_PROFILING
2090
+ cl_event evt;
2091
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2092
+
2093
+ g_profiling_info.emplace_back();
2094
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2095
+ #else
2096
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2097
+ #endif
2098
+ }
2099
+
2100
+ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2101
+ GGML_ASSERT(src0);
2102
+ GGML_ASSERT(src0->extra);
2103
+ GGML_ASSERT(src1);
2104
+ GGML_ASSERT(src1->extra);
2105
+ GGML_ASSERT(dst);
2106
+ GGML_ASSERT(dst->extra);
2107
+
2108
+ const int ne00 = src0 ? src0->ne[0] : 0;
2109
+ const int ne01 = src0 ? src0->ne[1] : 0;
2110
+ const int ne02 = src0 ? src0->ne[2] : 0;
2111
+ const int ne03 = src0 ? src0->ne[3] : 0;
2112
+
2113
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
2114
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2115
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
2116
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
2117
+
2118
+ const int ne10 = src1 ? src1->ne[0] : 0;
2119
+ const int ne11 = src1 ? src1->ne[1] : 0;
2120
+ const int ne12 = src1 ? src1->ne[2] : 0;
2121
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
2122
+
2123
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
2124
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
2125
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
2126
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
2127
+
2128
+ const int ne0 = dst ? dst->ne[0] : 0;
2129
+ const int ne1 = dst ? dst->ne[1] : 0;
2130
+ const int ne2 = dst ? dst->ne[2] : 0;
2131
+ const int ne3 = dst ? dst->ne[3] : 0;
2132
+
2133
+ const cl_ulong nb0 = dst ? dst->nb[0] : 0;
2134
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
2135
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
2136
+ const cl_ulong nb3 = dst ? dst->nb[3] : 0;
2137
+
2138
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2139
+ cl_command_queue queue = backend_ctx->queue;
2140
+
2141
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2142
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
2143
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2144
+
2145
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2146
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
2147
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2148
+
2149
+ bool bcast_row = false;
2150
+ cl_kernel kernel;
2151
+
2152
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2153
+ GGML_ASSERT(ggml_is_contiguous(src0));
2154
+
2155
+ // src1 is a row
2156
+ GGML_ASSERT(ne11 == 1);
2157
+
2158
+ bcast_row = true;
2159
+ int ne = ne00 / 4;
2160
+ kernel = backend_ctx->kernel_add_row;
2161
+
2162
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2163
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2164
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2165
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2166
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2167
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2168
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
2169
+ } else {
2170
+ kernel = backend_ctx->kernel_add;
2171
+
2172
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2173
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2174
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2175
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2176
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2177
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2178
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
2179
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
2180
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
2181
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
2182
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
2183
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
2184
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
2185
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
2186
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
2187
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
2188
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
2189
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
2190
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
2191
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
2192
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
2193
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
2194
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
2195
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
2196
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
2197
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
2198
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
2199
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
2200
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
2201
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
2202
+ }
2203
+
2204
+ if (bcast_row) {
2205
+ int n = ggml_nelements(dst)/4;
2206
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2207
+ size_t local_work_size[] = {64, 1, 1};
2208
+
2209
+ #ifdef GGML_OPENCL_PROFILING
2210
+ cl_event evt;
2211
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2212
+
2213
+ g_profiling_info.emplace_back();
2214
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2215
+ #else
2216
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2217
+ #endif
2218
+ } else {
2219
+ unsigned int nth = MIN(64, ne0);
2220
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
2221
+ size_t local_work_size[] = {nth, 1, 1};
2222
+
2223
+ #ifdef GGML_OPENCL_PROFILING
2224
+ cl_event evt;
2225
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2226
+
2227
+ g_profiling_info.emplace_back();
2228
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2229
+ #else
2230
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2231
+ #endif
2232
+ }
2233
+ }
2234
+
2235
+ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2236
+ GGML_ASSERT(src0);
2237
+ GGML_ASSERT(src0->extra);
2238
+ GGML_ASSERT(src1);
2239
+ GGML_ASSERT(src1->extra);
2240
+ GGML_ASSERT(dst);
2241
+ GGML_ASSERT(dst->extra);
2242
+
2243
+ const int ne00 = src0 ? src0->ne[0] : 0;
2244
+ const int ne01 = src0 ? src0->ne[1] : 0;
2245
+ const int ne02 = src0 ? src0->ne[2] : 0;
2246
+ const int ne03 = src0 ? src0->ne[3] : 0;
2247
+
2248
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
2249
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2250
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
2251
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
2252
+
2253
+ const int ne10 = src1 ? src1->ne[0] : 0;
2254
+ const int ne11 = src1 ? src1->ne[1] : 0;
2255
+ const int ne12 = src1 ? src1->ne[2] : 0;
2256
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
2257
+
2258
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
2259
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
2260
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
2261
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
2262
+
2263
+ const int ne0 = dst ? dst->ne[0] : 0;
2264
+ const int ne1 = dst ? dst->ne[1] : 0;
2265
+ const int ne2 = dst ? dst->ne[2] : 0;
2266
+ const int ne3 = dst ? dst->ne[3] : 0;
2267
+
2268
+ const cl_ulong nb0 = dst ? dst->nb[0] : 0;
2269
+ const cl_ulong nb1 = dst ? dst->nb[1] : 0;
2270
+ const cl_ulong nb2 = dst ? dst->nb[2] : 0;
2271
+ const cl_ulong nb3 = dst ? dst->nb[3] : 0;
2272
+
2273
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2274
+ cl_command_queue queue = backend_ctx->queue;
2275
+
2276
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2277
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
2278
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2279
+
2280
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2281
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
2282
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2283
+
2284
+ bool bcast_row = false;
2285
+ cl_kernel kernel;
2286
+
2287
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2288
+ GGML_ASSERT(ggml_is_contiguous(src0));
2289
+
2290
+ // src1 is a row
2291
+ GGML_ASSERT(ne11 == 1);
2292
+
2293
+ bcast_row = true;
2294
+ int ne = ne00 / 4;
2295
+ kernel = backend_ctx->kernel_mul_row;
2296
+
2297
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2298
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2299
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2300
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2301
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2302
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2303
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
2304
+ } else {
2305
+ kernel = backend_ctx->kernel_mul;
2306
+
2307
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2308
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2309
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
2310
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
2311
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
2312
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
2313
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
2314
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
2315
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
2316
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
2317
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
2318
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
2319
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
2320
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
2321
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
2322
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
2323
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
2324
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
2325
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
2326
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
2327
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
2328
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
2329
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
2330
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
2331
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
2332
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
2333
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
2334
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
2335
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
2336
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
2337
+ }
2338
+
2339
+ if (bcast_row) {
2340
+ int n = ggml_nelements(dst)/4;
2341
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2342
+ size_t local_work_size[] = {64, 1, 1};
2343
+
2344
+ #ifdef GGML_OPENCL_PROFILING
2345
+ cl_event evt;
2346
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2347
+
2348
+ g_profiling_info.emplace_back();
2349
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2350
+ #else
2351
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2352
+ #endif
2353
+ } else {
2354
+ unsigned int nth = MIN(64, ne0);
2355
+ size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
2356
+ size_t local_work_size[] = {nth, 1, 1};
2357
+
2358
+ #ifdef GGML_OPENCL_PROFILING
2359
+ cl_event evt;
2360
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2361
+
2362
+ g_profiling_info.emplace_back();
2363
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2364
+ #else
2365
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2366
+ #endif
2367
+ }
2368
+ }
2369
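The two element-wise handlers above share the same dispatch: when src1 is a single contiguous row whose length is a multiple of 4, a float4 row-broadcast kernel runs over ggml_nelements(dst)/4 work-items; otherwise the general kernel runs one workgroup of up to 64 work-items per src0 row, presumably to keep the fast path free of per-element stride arithmetic. A minimal host-side sketch of that decision over plain integers (illustrative names, not code from this package):

// Minimal sketch (not code from this package): the launch-shape decision made
// by the element-wise handlers above, expressed over plain dimensions.
#include <algorithm>
#include <cstddef>

struct launch_shape {
    bool   use_row_kernel;   // true -> float4 row-broadcast kernel (kernel_mul_row style)
    size_t global[3];
    size_t local[3];
};

// ne00..ne03: src0 dims, ne10: src1 row length, src1_nelems: total src1 elements,
// dst_nelems: total dst elements. All parameter names are illustrative.
static launch_shape pick_elementwise_launch(int ne00, int ne01, int ne02, int ne03,
                                            long src1_nelems, int ne10, long dst_nelems,
                                            bool src1_contiguous) {
    launch_shape s = {};
    if (src1_nelems == ne10 && src1_contiguous && ne00 % 4 == 0 && ne10 % 4 == 0) {
        // src1 is a single, contiguous, float4-aligned row: one work-item per 4 dst elements.
        s.use_row_kernel = true;
        s.global[0] = (size_t)(dst_nelems / 4); s.global[1] = 1; s.global[2] = 1;
        s.local[0]  = 64;                       s.local[1]  = 1; s.local[2]  = 1;
    } else {
        // General case: one workgroup of up to 64 work-items per src0 row.
        const size_t nth = (size_t)std::min(64, ne00);
        s.use_row_kernel = false;
        s.global[0] = (size_t)ne01 * nth; s.global[1] = (size_t)ne02; s.global[2] = (size_t)ne03;
        s.local[0]  = nth;                s.local[1]  = 1;            s.local[2]  = 1;
    }
    return s;
}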
+
2370
+ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2371
+ GGML_ASSERT(src0);
2372
+ GGML_ASSERT(src0->extra);
2373
+ GGML_ASSERT(dst);
2374
+ GGML_ASSERT(dst->extra);
2375
+
2376
+ UNUSED(src1);
2377
+
2378
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2379
+ cl_command_queue queue = backend_ctx->queue;
2380
+
2381
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2382
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2383
+
2384
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2385
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2386
+
2387
+ cl_kernel kernel;
2388
+
2389
+ int n = ggml_nelements(dst);
2390
+
2391
+ if (n % 4 == 0) {
2392
+ kernel = backend_ctx->kernel_gelu_4;
2393
+ n /= 4;
2394
+ } else {
2395
+ kernel = backend_ctx->kernel_gelu;
2396
+ }
2397
+
2398
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2399
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2400
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2401
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2402
+
2403
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2404
+ size_t local_work_size[] = {64, 1, 1};
2405
+
2406
+ #ifdef GGML_OPENCL_PROFILING
2407
+ cl_event evt;
2408
+ clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
2409
+
2410
+ g_profiling_info.emplace_back();
2411
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2412
+ #else
2413
+ clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
2414
+ #endif
2415
+ }
2416
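ggml_cl_gelu above (and ggml_cl_silu below) only choose between a scalar kernel and a float4 kernel based on whether the element count is divisible by 4; the kernel sources themselves are not part of this hunk. For orientation, a CPU reference of what these activations compute, assuming ggml's usual tanh approximation of GELU (a sketch, not code from the package):

// CPU reference for the activations dispatched above -- a sketch for orientation
// only; the OpenCL kernel sources (kernel_gelu, kernel_silu and the *_4 float4
// variants) are not part of this hunk.
#include <cmath>

static inline float ref_gelu(float x) {
    // tanh approximation conventionally used by ggml
    const float k = 0.79788456080286535588f;   // sqrt(2/pi)
    return 0.5f * x * (1.0f + std::tanh(k * (x + 0.044715f * x * x * x)));
}

static inline float ref_silu(float x) {
    return x / (1.0f + std::exp(-x));          // x * sigmoid(x)
}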
+
2417
+ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2418
+ GGML_ASSERT(src0);
2419
+ GGML_ASSERT(src0->extra);
2420
+ GGML_ASSERT(dst);
2421
+ GGML_ASSERT(dst->extra);
2422
+
2423
+ UNUSED(src1);
2424
+
2425
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2426
+ cl_command_queue queue = backend_ctx->queue;
2427
+
2428
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2429
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2430
+
2431
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2432
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2433
+
2434
+ cl_kernel kernel;
2435
+
2436
+ int n = ggml_nelements(dst);
2437
+
2438
+ if (n % 4 == 0) {
2439
+ kernel = backend_ctx->kernel_silu_4;
2440
+ n /= 4;
2441
+ } else {
2442
+ kernel = backend_ctx->kernel_silu;
2443
+ }
2444
+
2445
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2446
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2447
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2448
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2449
+
2450
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2451
+ size_t local_work_size[] = {64, 1, 1};
2452
+
2453
+ #ifdef GGML_OPENCL_PROFILING
2454
+ cl_event evt;
2455
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2456
+
2457
+ g_profiling_info.emplace_back();
2458
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2459
+ #else
2460
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2461
+ #endif
2462
+ }
2463
+
2464
+ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2465
+ GGML_ASSERT(src0);
2466
+ GGML_ASSERT(src0->extra);
2467
+ GGML_ASSERT(dst);
2468
+ GGML_ASSERT(dst->extra);
2469
+
2470
+ UNUSED(src1);
2471
+
2472
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2473
+ cl_command_queue queue = backend_ctx->queue;
2474
+
2475
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2476
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2477
+
2478
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2479
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2480
+
2481
+ cl_kernel kernel = backend_ctx->kernel_relu;
2482
+
2483
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2484
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2485
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2486
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2487
+
2488
+ const int64_t n = ggml_nelements(dst);
2489
+
2490
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2491
+ size_t local_work_size[] = {64, 1, 1};
2492
+
2493
+ #ifdef GGML_OPENCL_PROFILING
2494
+ cl_event evt;
2495
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2496
+
2497
+ g_profiling_info.emplace_back();
2498
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2499
+ #else
2500
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2501
+ #endif
2502
+ }
2503
+
2504
+ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2505
+ GGML_ASSERT(src0);
2506
+ GGML_ASSERT(src0->extra);
2507
+ GGML_ASSERT(dst);
2508
+ GGML_ASSERT(dst->extra);
2509
+
2510
+ UNUSED(src1);
2511
+
2512
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2513
+ cl_command_queue queue = backend_ctx->queue;
2514
+
2515
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2516
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2517
+
2518
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2519
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2520
+
2521
+ float min;
2522
+ float max;
2523
+ memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
2524
+ memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
2525
+
2526
+ cl_kernel kernel = backend_ctx->kernel_clamp;
2527
+
2528
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2529
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2530
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2531
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2532
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min));
2533
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max));
2534
+
2535
+ const int64_t n = ggml_nelements(dst);
2536
+
2537
+ size_t global_work_size[] = {(size_t)n, 1, 1};
2538
+ size_t local_work_size[] = {64, 1, 1};
2539
+
2540
+ #ifdef GGML_OPENCL_PROFILING
2541
+ cl_event evt;
2542
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2543
+
2544
+ g_profiling_info.emplace_back();
2545
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2546
+ #else
2547
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2548
+ #endif
2549
+ }
2550
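ggml_cl_clamp above shows the op_params convention used throughout this file: an operator's scalar parameters are stored in dst->op_params (an int32_t array) and read back bit-exactly with memcpy, the same way norm and rms_norm read eps, scale reads its factor, and soft_max reads scale and max_bias further down. A minimal sketch of that pattern (fake_tensor is a hypothetical stand-in, not a ggml type):

// Sketch of the op_params convention: scalar parameters live in an int32_t
// array and are read back bit-exactly with memcpy.
#include <cstdint>
#include <cstring>

struct fake_tensor { int32_t op_params[16]; };   // hypothetical stand-in for ggml_tensor

static void read_clamp_params(const fake_tensor * dst, float * min_out, float * max_out) {
    std::memcpy(min_out, (const char *)dst->op_params + 0 * sizeof(float), sizeof(float));
    std::memcpy(max_out, (const char *)dst->op_params + 1 * sizeof(float), sizeof(float));
}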
+
2551
+ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2552
+ GGML_ASSERT(src0);
2553
+ GGML_ASSERT(src0->extra);
2554
+ GGML_ASSERT(dst);
2555
+ GGML_ASSERT(dst->extra);
2556
+
2557
+ UNUSED(src1);
2558
+
2559
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2560
+ cl_command_queue queue = backend_ctx->queue;
2561
+
2562
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2563
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2564
+
2565
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2566
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2567
+
2568
+ float eps;
2569
+ memcpy(&eps, dst->op_params, sizeof(float));
2570
+
2571
+ const int ne00 = src0 ? src0->ne[0] : 0;
2572
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2573
+
2574
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
2575
+
2576
+ const int nth = MIN(64, ne00);
2577
+
2578
+ cl_kernel kernel = backend_ctx->kernel_norm;
2579
+
2580
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2581
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2582
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2583
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2584
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
2585
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
2586
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
2587
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
2588
+
2589
+ const int64_t nrows = ggml_nrows(src0);
2590
+
2591
+ size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
2592
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
2593
+
2594
+ #ifdef GGML_OPENCL_PROFILING
2595
+ cl_event evt;
2596
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2597
+
2598
+ g_profiling_info.emplace_back();
2599
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2600
+ #else
2601
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2602
+ #endif
2603
+ }
2604
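ggml_cl_norm above launches one workgroup of nth = MIN(64, ne00) work-items per row and hands the kernel a dynamically sized __local scratch buffer for the reduction by passing a size with a NULL value to clSetKernelArg. A small sketch of that standard OpenCL idiom (function and variable names are illustrative):

// Sketch of the dynamic __local allocation idiom used above: passing a size and
// a NULL value to clSetKernelArg reserves that much local memory for the
// corresponding __local kernel argument. Names are illustrative.
#include <CL/cl.h>
#include <algorithm>

static cl_int set_row_reduction_scratch(cl_kernel k, cl_uint local_arg_idx, int row_len) {
    const int nth = std::min(64, row_len);                 // work-items per row
    // one float of scratch per work-item for the in-workgroup reduction
    return clSetKernelArg(k, local_arg_idx, sizeof(float) * nth, NULL);
}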
+
2605
+ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2606
+ GGML_ASSERT(src0);
2607
+ GGML_ASSERT(src0->extra);
2608
+ GGML_ASSERT(dst);
2609
+ GGML_ASSERT(dst->extra);
2610
+
2611
+ UNUSED(src1);
2612
+
2613
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2614
+ cl_command_queue queue = backend_ctx->queue;
2615
+
2616
+ ggml_backend_opencl_device_context * dev_ctx =
2617
+ (ggml_backend_opencl_device_context *)backend->device->context;
2618
+
2619
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2620
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2621
+
2622
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2623
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2624
+
2625
+ float eps;
2626
+ memcpy(&eps, dst->op_params, sizeof(float));
2627
+
2628
+ const int ne00 = src0 ? src0->ne[0] : 0;
2629
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2630
+
2631
+ GGML_ASSERT(ne00 % 4 == 0);
2632
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
2633
+
2634
+ const int nth = MIN(64, ne00);
2635
+
2636
+ const int64_t nrows = ggml_nrows(src0);
2637
+
2638
+ size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
2639
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
2640
+
2641
+ cl_kernel kernel = backend_ctx->kernel_rms_norm;
2642
+
2643
+       // Note: this kernel declares local memory in its kernel args, and the size
2644
+       // depends on the subgroup size.
2645
+       // Retrieve the subgroup size.
2646
+       // Note: this requires OpenCL 2.1 and above.
2647
+ size_t sgs;
2648
+ CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
2649
+ CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
2650
+ sizeof(local_work_size), local_work_size,
2651
+ sizeof(size_t), &sgs, NULL));
2652
+
2653
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2654
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2655
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2656
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2657
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
2658
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
2659
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
2660
+ // This is local memory - the size depends on subgroup size.
2661
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
2662
+
2663
+ #ifdef GGML_OPENCL_PROFILING
2664
+ cl_event evt;
2665
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
2666
+
2667
+ g_profiling_info.emplace_back();
2668
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2669
+ #else
2670
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
2671
+ #endif
2672
+ }
2673
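The rms_norm path above differs from ggml_cl_norm in that the kernel keeps one partial sum per subgroup rather than per work-item, so the host first queries the maximum subgroup size for the chosen local size with clGetKernelSubGroupInfo (OpenCL 2.1+) and then requests nth/sgs floats of local memory. A hedged sketch of that query (names are illustrative):

// Sketch of the subgroup-size query used above (requires OpenCL 2.1+). The
// kernel keeps one partial sum per subgroup, so the scratch size is nth / sgs
// floats rather than nth.
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>

static size_t rms_norm_scratch_floats(cl_kernel k, cl_device_id dev,
                                      const size_t local_size[3], size_t nth) {
    size_t sgs = 0;   // max subgroup size for this local size
    clGetKernelSubGroupInfo(k, dev, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                            sizeof(size_t) * 3, local_size,
                            sizeof(sgs), &sgs, NULL);
    return sgs ? nth / sgs : nth;   // floats of __local scratch to request
}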
+
2674
+ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2675
+ GGML_ASSERT(src0);
2676
+ GGML_ASSERT(src0->extra);
2677
+ GGML_ASSERT(src1);
2678
+ GGML_ASSERT(src1->extra);
2679
+ GGML_ASSERT(dst);
2680
+ GGML_ASSERT(dst->extra);
2681
+
2682
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
2683
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
2684
+
2685
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2686
+ cl_command_queue queue = backend_ctx->queue;
2687
+
2688
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2689
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
2690
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2691
+
2692
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
2693
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
2694
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
2695
+
2696
+ #ifdef GGML_OPENCL_SOA_Q
2697
+ ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
2698
+ #endif
2699
+
2700
+ const int ne00 = src0 ? src0->ne[0] : 0;
2701
+ const int ne01 = src0 ? src0->ne[1] : 0;
2702
+ const int ne02 = src0 ? src0->ne[2] : 0;
2703
+ const int ne03 = src0 ? src0->ne[3] : 0;
2704
+
2705
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
2706
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
2707
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
2708
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
2709
+
2710
+ const int ne10 = src1 ? src1->ne[0] : 0;
2711
+ const int ne11 = src1 ? src1->ne[1] : 0;
2712
+ const int ne12 = src1 ? src1->ne[2] : 0;
2713
+ const int ne13 = src1 ? src1->ne[3] : 0;
2714
+
2715
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
2716
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
2717
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
2718
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
2719
+
2720
+ const int ne0 = dst ? dst->ne[0] : 0;
2721
+ const int ne1 = dst ? dst->ne[1] : 0;
2722
+
2723
+ int r2 = ne12/ne02;
2724
+ int r3 = ne13/ne03;
2725
+
2726
+ GGML_ASSERT(ne00 == ne10);
2727
+
2728
+ int nth0 = 32;
2729
+ int nth1 = 1;
2730
+ int nrows = 1;
2731
+ // The number of values produced by each subgroup
2732
+ int ndst = 4;
2733
+
2734
+ cl_kernel kernel;
2735
+
2736
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2737
+ cl_context context = backend_ctx->context;
2738
+
2739
+ if (ne01 && ne1 && use_adreno_kernels(src0)) {
2740
+
2741
+ // init CL objects
2742
+ // <--------------------------------------------> //
2743
+ cl_int status;
2744
+ cl_image_format img_fmt_1d;
2745
+ cl_image_desc img_desc_1d;
2746
+ cl_buffer_region region;
2747
+ cl_mem A_image1d;
2748
+ cl_mem B_image1d;
2749
+ cl_mem B_sub_buffer;
2750
+ cl_mem C_d;
2751
+ // for B transpose
2752
+ cl_mem B_d;
2753
+ cl_mem B_d_input_image;
2754
+ // <--------------------------------------------> //
2755
+
2756
+ // define matrix dimensions
2757
+ // <--------------------------------------------> //
2758
+ int M = ne01;
2759
+ int N = ne1;
2760
+ int K = ne00;
2761
+ int padding;
2762
+ // <--------------------------------------------> //
2763
+
2764
+ // q4_0 x fp32
2765
+ if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
2766
+ // TODO: remove duplicate definitions of image description + format -- move to top
2767
+
2768
+ // create an image for A
2769
+ // <--------------------------------------------> //
2770
+ if (N == 1) {
2771
+ img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
2772
+ } else {
2773
+ img_fmt_1d = { CL_R, CL_FLOAT};
2774
+ }
2775
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
2776
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
2777
+ img_desc_1d.image_width = M * K / 2 / 4; // Divide by 4 for char -> float
2778
+ img_desc_1d.buffer = extra0_q4_0->q;
2779
+ A_image1d = clCreateImage(
2780
+ context,
2781
+ CL_MEM_READ_ONLY,
2782
+ &img_fmt_1d,
2783
+ &img_desc_1d,
2784
+ NULL,
2785
+ &status);
2786
+ CL_CHECK(status);
2787
+ // <--------------------------------------------> //
2788
+
2789
+
2790
+ // create a sub_buffer for B
2791
+ // <--------------------------------------------> //
2792
+ region.origin = (extra1->offset);
2793
+ region.size = K * N * sizeof(float);
2794
+ B_sub_buffer = clCreateSubBuffer(
2795
+ extra1->data_device,
2796
+ 0,
2797
+ CL_BUFFER_CREATE_TYPE_REGION,
2798
+ &region,
2799
+ &status);
2800
+ CL_CHECK(status);
2801
+ // <--------------------------------------------> //
2802
+
2803
+ // transpose activation for Skyler's gemm
2804
+ if (N != 1) {
2805
+ //how many extra elements beyond multiple of 8
2806
+ int extra_elements = N % 8;
2807
+
2808
+ //how much padding to add
2809
+ padding = 0;
2810
+ if (extra_elements > 0){
2811
+ padding = 8 - extra_elements;
2812
+ }
2813
+
2814
+ // Specify the starting offset (in bytes)
2815
+ region.origin = 0;
2816
+ // Specify the size of the sub-buffer (divide by 2 for FP16)
2817
+ region.size = K * (N + padding) * sizeof(float)/2;
2818
+ B_d = clCreateSubBuffer(
2819
+ backend_ctx->B_d_max,
2820
+ 0,
2821
+ CL_BUFFER_CREATE_TYPE_REGION,
2822
+ &region,
2823
+ &status);
2824
+ CL_CHECK(status);
2825
+
2826
+ cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
2827
+ cl_image_desc image_desc_B_d_input = {
2828
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
2829
+ static_cast<size_t>(K * N / 4),
2830
+ 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
2831
+ };
2832
+ B_d_input_image = clCreateImage(
2833
+ context,
2834
+ 0,
2835
+ &image_format_B_d_input,
2836
+ &image_desc_B_d_input,
2837
+ NULL,
2838
+ &status);
2839
+ CL_CHECK(status);
2840
+
2841
+ cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
2842
+ cl_image_desc image_desc_B_d_output = {
2843
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
2844
+ static_cast<size_t>(K * (N + padding)/4),
2845
+ 0, 0, 0, 0, 0, 0, 0, { B_d }
2846
+ };
2847
+ B_image1d = clCreateImage(
2848
+ context,
2849
+ 0,
2850
+ &image_format_B_d_output,
2851
+ &image_desc_B_d_output,
2852
+ NULL,
2853
+ &status);
2854
+ CL_CHECK(status);
2855
+
2856
+ int height_B = N/4;
2857
+ int width_B = K/4;
2858
+ int padded_height_B = (N + padding)/4;
2859
+
2860
+ kernel = backend_ctx->kernel_transpose_32_16;
2861
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
2862
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
2863
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
2864
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
2865
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
2866
+
2867
+ size_t local_size_t[2] = { 1, 16 };
2868
+ //WGS tuning
2869
+ if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
2870
+ local_size_t[0]=4;
2871
+ local_size_t[1]=8;
2872
+ } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
2873
+ local_size_t[0]=2;
2874
+ local_size_t[1]=8;
2875
+ } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
2876
+ local_size_t[0]=1;
2877
+ local_size_t[1]=8;
2878
+ } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
2879
+ local_size_t[0]=2;
2880
+ local_size_t[1]=8;
2881
+ }
2882
+
2883
+ size_t global_size_t[2] = {
2884
+ static_cast<size_t>(width_B),
2885
+ static_cast<size_t>(padded_height_B)
2886
+ };
2887
+
2888
+ #ifdef GGML_OPENCL_PROFILING
2889
+ cl_event evt;
2890
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
2891
+
2892
+ g_profiling_info.emplace_back();
2893
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
2894
+ #else
2895
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
2896
+ #endif
2897
+ } else {
2898
+ // no need to transpose B in other cases
2899
+ // create an image for B from sub_buffer
2900
+ // <--------------------------------------------> //
2901
+ img_fmt_1d = {CL_RGBA, CL_FLOAT};
2902
+
2903
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
2904
+ img_desc_1d.image_width = K * N / 4;
2905
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
2906
+ img_desc_1d.buffer = B_sub_buffer;
2907
+ B_image1d = clCreateImage(
2908
+ context,
2909
+ CL_MEM_READ_ONLY,
2910
+ &img_fmt_1d,
2911
+ &img_desc_1d,
2912
+ NULL,
2913
+ &status);
2914
+ CL_CHECK(status);
2915
+ // <--------------------------------------------> //
2916
+ }
2917
+
2918
+ // choose gemm or gemv kernel
2919
+ // <--------------------------------------------> //
2920
+ if (N == 1) {
2921
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
2922
+ if (M == 4096 && K == 4096) {
2923
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
2924
+ } else if (M == 4096 && K == 11008) {
2925
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
2926
+ } else if (M == 11008 && K == 4096) {
2927
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
2928
+ } else if (M == 32000 && K == 4096) {
2929
+ kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
2930
+ }
2931
+ } else {
2932
+ kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
2933
+ }
2934
+ // <--------------------------------------------> //
2935
+
2936
+ // set kernel args
2937
+ // <--------------------------------------------> //
2938
+ cl_uint k_arg = 0;
2939
+
2940
+ if (N == 1) {
2941
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
2942
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d));
2943
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
2944
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
2945
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
2946
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
2947
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
2948
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
2949
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
2950
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
2951
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
2952
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
2953
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
2954
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
2955
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
2956
+ } else {
2957
+ region.origin = extrad->offset; // Specify the starting offset (in bytes)
2958
+ region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
2959
+ C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
2960
+ CL_CHECK(status);
2961
+
2962
+ int padded_N = ne1 + padding;
2963
+
2964
+             CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); //A_q_d
2965
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d
2966
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d
2967
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d
2968
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); //M
2969
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &padded_N)); //N with padding
2970
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); //K
2971
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne1)); //N without padding
2972
+ }
2973
+ // <--------------------------------------------> //
2974
+
2975
+ // choose workgroup size
2976
+ // <--------------------------------------------> //
2977
+ size_t global_work_size[3] = {
2978
+ 64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
2979
+ size_t local_work_size[3] = {64, 2, 4};
2980
+
2981
+ global_work_size[0] = (size_t)(ceil((float)ne1/8));
2982
+ global_work_size[1] = (size_t)(ne01/4);
2983
+ global_work_size[2] = (size_t)(1);
2984
+
2985
+ local_work_size[0] = (size_t)(1); //4x32 for FP32
2986
+ local_work_size[1] = (size_t)(128);
2987
+ local_work_size[2] = (size_t)(1);
2988
+
2989
+ //WGS tuning
2990
+ if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
2991
+ local_work_size[0] = 1;
2992
+ local_work_size[1] = 128;
2993
+ } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
2994
+ local_work_size[0] = 2;
2995
+ local_work_size[1] = 64;
2996
+ } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
2997
+ local_work_size[0] = 2;
2998
+ local_work_size[1] = 64;
2999
+ } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
3000
+ local_work_size[0] = 2;
3001
+ local_work_size[1] = 64;
3002
+ }
3003
+
3004
+ if (N == 1) {
3005
+ local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
3006
+ local_work_size[1] = 4; // reduce factor
3007
+ local_work_size[2] = 1;
3008
+
3009
+ global_work_size[0] = M / 2;
3010
+ global_work_size[1] = 4; // reduce factor
3011
+ global_work_size[2] = 1;
3012
+ }
3013
+ // <--------------------------------------------> //
3014
+
3015
+ // enqueue kernel with profiling
3016
+ // <--------------------------------------------> //
3017
+ #ifdef GGML_OPENCL_PROFILING
3018
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3019
+
3020
+ g_profiling_info.emplace_back();
3021
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3022
+ // enqueue kernel without profiling
3023
+ #else
3024
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3025
+ #endif
3026
+ // <--------------------------------------------> //
3027
+
3028
+ // deallocate sub buffers and images
3029
+ // <--------------------------------------------> //
3030
+ CL_CHECK(clReleaseMemObject(A_image1d));
3031
+ CL_CHECK(clReleaseMemObject(B_sub_buffer));
3032
+ CL_CHECK(clReleaseMemObject(B_image1d));
3033
+
3034
+ if (N != 1) {
3035
+ CL_CHECK(clReleaseMemObject(B_d));
3036
+ CL_CHECK(clReleaseMemObject(B_d_input_image));
3037
+ CL_CHECK(clReleaseMemObject(C_d));
3038
+ }
3039
+ // <--------------------------------------------> //
3040
+
3041
+ return;
3042
+ }
3043
+ } // if (ne01 && ne1)
3044
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
3045
+
3046
+ if (!ggml_is_transposed(src0) &&
3047
+ !ggml_is_transposed(src1) &&
3048
+ src1t == GGML_TYPE_F32 &&
3049
+ ne00%32 == 0 &&
3050
+ ne11 > 2) {
3051
+ #ifdef GGML_OPENCL_SOA_Q
3052
+ // Set up kernel.
3053
+ switch(src0t) {
3054
+ case GGML_TYPE_Q4_0:
3055
+ // This should have been satisfied.
3056
+ GGML_ASSERT(ne11 == ne1);
3057
+ GGML_ASSERT(ne01 == ne0);
3058
+
3059
+ if (backend_ctx->gpu_family == INTEL) {
3060
+ nth0 = 16;
3061
+ nth1 = 1;
3062
+
3063
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
3064
+ } else if (backend_ctx->gpu_family == ADRENO) {
3065
+ nth0 = 64;
3066
+ nth1 = 1;
3067
+
3068
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
3069
+ } else {
3070
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3071
+ }
3072
+
3073
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
3074
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
3075
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3076
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3077
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3078
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3079
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3080
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3081
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3082
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
3083
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
3084
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
3085
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
3086
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
3087
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
3088
+ break;
3089
+ default:
3090
+ break;
3091
+ }
3092
+
3093
+ // Launch kernel.
3094
+ if (src0t == GGML_TYPE_Q4_0) {
3095
+ size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
3096
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
3097
+
3098
+ if (backend_ctx->gpu_family == INTEL) {
3099
+ // Set global size for Intel. It uses 16x output values.
3100
+ global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
3101
+ global_work_size[1] = (size_t)ne11*nth1;
3102
+ global_work_size[2] = (size_t)ne12*ne13;
3103
+ }
3104
+
3105
+ #ifdef GGML_OPENCL_PROFILING
3106
+ cl_event evt;
3107
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3108
+
3109
+ g_profiling_info.emplace_back();
3110
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3111
+ #else
3112
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3113
+ #endif
3114
+ return;
3115
+ }
3116
+ #else // GGML_OPENCL_SOA_Q
3117
+ // TODO: add block_q4_0 variant.
3118
+ #endif // GGML_OPENCL_SOA_Q
3119
+ }
3120
+
3121
+ // use custom matrix x vector kernel
3122
+ switch (src0t) {
3123
+ case GGML_TYPE_F32:
3124
+ //GGML_ASSERT(ne02 == ne12);
3125
+ GGML_ASSERT(src1t == GGML_TYPE_F32);
3126
+ kernel = backend_ctx->kernel_mul_mat_f32_f32;
3127
+ nrows = 4;
3128
+
3129
+ if (backend_ctx->gpu_family == INTEL) {
3130
+ nth0 = 32;
3131
+ nth1 = 1;
3132
+ } else if (backend_ctx->gpu_family == ADRENO) {
3133
+ nth0 = 64;
3134
+ nth1 = 1;
3135
+ } else {
3136
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3137
+ }
3138
+
3139
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3140
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3141
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3142
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3143
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3144
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3145
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3146
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3147
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3148
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
3149
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
3150
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
3151
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
3152
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
3153
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
3154
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
3155
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
3156
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
3157
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
3158
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
3159
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
3160
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
3161
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
3162
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
3163
+ break;
3164
+ case GGML_TYPE_F16:
3165
+ //GGML_ASSERT(ne02 == ne12);
3166
+ if (backend_ctx->gpu_family == INTEL) {
3167
+ nth0 = 32;
3168
+ nth1 = 1;
3169
+ } else if (backend_ctx->gpu_family == ADRENO) {
3170
+ nth0 = 64;
3171
+ nth1 = 1;
3172
+ } else {
3173
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3174
+ }
3175
+
3176
+ if (src1t == GGML_TYPE_F32) {
3177
+ if (ne11 * ne12 < 4) {
3178
+ kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
3179
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
3180
+ kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
3181
+ nrows = ne11;
3182
+ } else {
3183
+ kernel = backend_ctx->kernel_mul_mat_f16_f32;
3184
+ nrows = 4;
3185
+ }
3186
+ } else {
3187
+ kernel = backend_ctx->kernel_mul_mat_f16_f16;
3188
+ nrows = 4;
3189
+ }
3190
+
3191
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3192
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3193
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3194
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3195
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3196
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3197
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3198
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3199
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3200
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
3201
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
3202
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
3203
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
3204
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
3205
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
3206
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
3207
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
3208
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
3209
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
3210
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
3211
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
3212
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
3213
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
3214
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
3215
+ break;
3216
+ case GGML_TYPE_Q4_0:
3217
+ // This should have been satisfied.
3218
+ GGML_ASSERT(ne11 == ne1);
3219
+ GGML_ASSERT(ne01 == ne0);
3220
+
3221
+ #ifdef GGML_OPENCL_SOA_Q
3222
+ if (backend_ctx->gpu_family == INTEL) {
3223
+ nth0 = 16;
3224
+ nth1 = 1;
3225
+
3226
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
3227
+ ndst = 8;
3228
+ } else if (backend_ctx->gpu_family == ADRENO) {
3229
+ nth0 = 64;
3230
+ nth1 = 1;
3231
+
3232
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
3233
+             ndst = 8;
3234
+ } else {
3235
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3236
+ }
3237
+
3238
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
3239
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
3240
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3241
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3242
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3243
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3244
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3245
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3246
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3247
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
3248
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
3249
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
3250
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
3251
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
3252
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
3253
+ #else // GGML_OPENCL_SOA_Q
3254
+ if (backend_ctx->gpu_family == INTEL) {
3255
+ // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
3256
+ // group produces N_DST (4 for Q4_0 kernel) values in the result.
3257
+ // The number of workgroups on dim 0 (the leading dimension) is
3258
+ // the nearest multiple of 4 that covers ne0 (equals ne01).
3259
+ nth0 = 16;
3260
+ nth1 = 1;
3261
+
3262
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
3263
+ ndst = 4;
3264
+ } else if (backend_ctx->gpu_family == ADRENO) {
3265
+ nth0 = 64;
3266
+ nth1 = 1;
3267
+
3268
+ kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
3269
+ ndst = 4;
3270
+ } else {
3271
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3272
+ }
3273
+
3274
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3275
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3276
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3277
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3278
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3279
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3280
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3281
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3282
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3283
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
3284
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
3285
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
3286
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
3287
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
3288
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
3289
+ #endif // GGML_OPENCL_SOA_Q
3290
+ break;
3291
+ case GGML_TYPE_Q4_1:
3292
+ case GGML_TYPE_Q8_0:
3293
+ case GGML_TYPE_Q2_K:
3294
+ case GGML_TYPE_Q3_K:
3295
+ case GGML_TYPE_Q4_K:
3296
+ case GGML_TYPE_Q5_K:
3297
+ case GGML_TYPE_Q6_K:
3298
+ kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
3299
+
3300
+ if (backend_ctx->gpu_family == INTEL) {
3301
+ nth0 = 2;
3302
+ nth1 = 16;
3303
+ } else if (backend_ctx->gpu_family == ADRENO) {
3304
+ nth0 = 2;
3305
+ nth1 = 64;
3306
+ } else {
3307
+ GGML_ASSERT(false && "TODO: Unknown GPU");
3308
+ }
3309
+
3310
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3311
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3312
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3313
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3314
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3315
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3316
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3317
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
3318
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
3319
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
3320
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
3321
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
3322
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
3323
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
3324
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
3325
+ break;
3326
+ default:
3327
+ GGML_ASSERT(false && "not implemented");
3328
+ }
3329
+
3330
+ if (src0t == GGML_TYPE_Q4_0 ||
3331
+ src0t == GGML_TYPE_Q4_1 ||
3332
+ src0t == GGML_TYPE_Q8_0 ||
3333
+ src0t == GGML_TYPE_Q2_K) {
3334
+ // Each SIMD group produces N_DST values in the result. Assuming each
3335
+ // workgroup has N_SIMDGROUP SIMD groups, then each workgroup will
3336
+ // produce N_DST*N_SIMDGROUP values in the result. Hence, the grid size
3337
+ // (number of workgroups) will be a nearest multiple of
3338
+ // N_DST*N_SIMDGROUP to cover the size of the dimension. Below, 4 is
3339
+ // N_DST*N_SIMDGROUP (see the kernel for Q4_0 matmul).
3340
+ size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
3341
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
3342
+
3343
+ #ifdef GGML_OPENCL_PROFILING
3344
+ cl_event evt;
3345
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3346
+
3347
+ g_profiling_info.emplace_back();
3348
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3349
+ #else
3350
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3351
+ #endif
3352
+ } else if (src0t == GGML_TYPE_Q4_K) {
3353
+ GGML_ASSERT(false && "not implemented");
3354
+ } else if (src0t == GGML_TYPE_Q3_K) {
3355
+ GGML_ASSERT(false && "not implemented");
3356
+ } else if (src0t == GGML_TYPE_Q5_K) {
3357
+ GGML_ASSERT(false && "not implemented");
3358
+ } else if (src0t == GGML_TYPE_Q6_K) {
3359
+ size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
3360
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
3361
+
3362
+ #ifdef GGML_OPENCL_PROFILING
3363
+ cl_event evt;
3364
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3365
+
3366
+ g_profiling_info.emplace_back();
3367
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3368
+ #else
3369
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3370
+ #endif
3371
+ } else {
3372
+ int64_t ny = (ne11 + nrows - 1)/nrows;
3373
+
3374
+ size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
3375
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
3376
+
3377
+ #ifdef GGML_OPENCL_PROFILING
3378
+ cl_event evt;
3379
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3380
+
3381
+ g_profiling_info.emplace_back();
3382
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3383
+ #else
3384
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3385
+ #endif
3386
+ }
3387
+ }
3388
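For the quantized matrix-vector kernels above, each workgroup produces ndst output rows (8 for the *_8x_flat kernels, 4 otherwise; the Q6_K path uses 2), so the dim-0 global size is ne01 rounded up to a multiple of ndst and scaled by the workgroup width nth0, while dims 1 and 2 cover the src1 columns and the broadcast/batch extent. A small sketch of that arithmetic with plain integers (not code from the package):

// Sketch of the ND-range arithmetic used for the quantized mul_mat launches
// above: each workgroup produces ndst output rows, so dimension 0 is ne01
// rounded up to a multiple of ndst, scaled by the workgroup width nth0.
#include <cstddef>

static void mul_mat_grid(int ne01, int ne11, int ne12, int ne13,
                         int nth0, int nth1, int ndst,
                         size_t global[3], size_t local[3]) {
    global[0] = (size_t)((ne01 + ndst - 1) / ndst) * nth0;  // output rows, ndst per workgroup
    global[1] = (size_t)ne11 * nth1;                        // src1 columns
    global[2] = (size_t)ne12 * ne13;                        // broadcast / batch dims
    local[0]  = (size_t)nth0;
    local[1]  = (size_t)nth1;
    local[2]  = 1;
}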
+
3389
+ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3390
+ GGML_ASSERT(src0);
3391
+ GGML_ASSERT(src0->extra);
3392
+ GGML_ASSERT(dst);
3393
+ GGML_ASSERT(dst->extra);
3394
+ GGML_UNUSED(src1);
3395
+
3396
+ GGML_ASSERT(ggml_is_contiguous(src0));
3397
+
3398
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3399
+ cl_command_queue queue = backend_ctx->queue;
3400
+
3401
+ float scale;
3402
+ memcpy(&scale, dst->op_params, sizeof(scale));
3403
+
3404
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3405
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3406
+
3407
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3408
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3409
+
3410
+ cl_kernel kernel = backend_ctx->kernel_scale;
3411
+
3412
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3413
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3414
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
3415
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
3416
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
3417
+
3418
+ int n = ggml_nelements(dst)/4;
3419
+
3420
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3421
+ size_t local_work_size[] = {64, 1, 1};
3422
+
3423
+ #ifdef GGML_OPENCL_PROFILING
3424
+ cl_event evt;
3425
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3426
+
3427
+ g_profiling_info.emplace_back();
3428
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3429
+ #else
3430
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3431
+ #endif
3432
+ }
3433
+
3434
+ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3435
+ GGML_ASSERT(src0);
3436
+ GGML_ASSERT(src0->extra);
3437
+ GGML_ASSERT(src1);
3438
+ GGML_ASSERT(src1->extra);
3439
+
3440
+ // GGML_OP_CPY happens between src0 and src1.
3441
+ // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
3442
+ UNUSED(dst);
3443
+
3444
+ const int ne00 = src0 ? src0->ne[0] : 0;
3445
+ const int ne01 = src0 ? src0->ne[1] : 0;
3446
+ const int ne02 = src0 ? src0->ne[2] : 0;
3447
+ const int ne03 = src0 ? src0->ne[3] : 0;
3448
+
3449
+ const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
3450
+ const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
3451
+ const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
3452
+ const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
3453
+
3454
+ const int ne10 = src1 ? src1->ne[0] : 0;
3455
+ const int ne11 = src1 ? src1->ne[1] : 0;
3456
+ const int ne12 = src1 ? src1->ne[2] : 0;
3457
+ const int ne13 = src1 ? src1->ne[3] : 0;
3458
+
3459
+ const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
3460
+ const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
3461
+ const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
3462
+ const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
3463
+
3464
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
3465
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
3466
+
3467
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3468
+ cl_command_queue queue = backend_ctx->queue;
3469
+
3470
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3471
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3472
+
3473
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3474
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3475
+
3476
+ cl_kernel kernel;
3477
+
3478
+ switch (src0t) {
3479
+ case GGML_TYPE_F32:
3480
+ switch (src1t) {
3481
+ case GGML_TYPE_F16:
3482
+ kernel = backend_ctx->kernel_cpy_f32_f16;
3483
+ break;
3484
+ case GGML_TYPE_F32:
3485
+ kernel = backend_ctx->kernel_cpy_f32_f32;
3486
+ break;
3487
+ default:
3488
+ GGML_ASSERT(false && "not implemented");
3489
+ }
3490
+ break;
3491
+ case GGML_TYPE_F16:
3492
+ switch (src1t) {
3493
+ case GGML_TYPE_F16:
3494
+ kernel = backend_ctx->kernel_cpy_f16_f16;
3495
+ break;
3496
+ case GGML_TYPE_F32:
3497
+ kernel = backend_ctx->kernel_cpy_f16_f32;
3498
+ break;
3499
+ default:
3500
+ GGML_ASSERT(false && "not implemented");
3501
+ }
3502
+ break;
3503
+ default:
3504
+ GGML_ASSERT(false && "not implemented");
3505
+ }
3506
+
3507
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3508
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3509
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3510
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3511
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
3512
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
3513
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
3514
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
3515
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
3516
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
3517
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
3518
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
3519
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
3520
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
3521
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
3522
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
3523
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
3524
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
3525
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
3526
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
3527
+
3528
+ const int nth = MIN(64, ne00);
3529
+
3530
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
3531
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
3532
+
3533
+ #ifdef GGML_OPENCL_PROFILING
3534
+ cl_event evt;
3535
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3536
+
3537
+ g_profiling_info.emplace_back();
3538
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
3539
+ #else
3540
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
3541
+ #endif
3542
+ }
3543
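The cpy kernels selected above work for non-contiguous tensors because ggml addresses every element through per-dimension byte strides nb[0..3] rather than assuming row-major packing, which is also why the handlers in this file pass both the ne* extents and the nb* strides of every operand to their kernels. A one-function sketch of that addressing rule (illustrative helper, not part of the package):

// Sketch of ggml's stride-based addressing: element (i0,i1,i2,i3) of a tensor
// lives at base + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3, whether or not the tensor
// is contiguous, transposed, or a view.
#include <cstddef>
#include <cstdint>

static inline const void * elem_ptr(const void * base,
                                    int i0, int i1, int i2, int i3,
                                    const uint64_t nb[4]) {
    return (const char *)base + (size_t)i0 * nb[0] + (size_t)i1 * nb[1]
                              + (size_t)i2 * nb[2] + (size_t)i3 * nb[3];
}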
+
3544
+ static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cl_cpy(backend, src0, dst, nullptr);
+ UNUSED(src1);
+ }
+
+ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ int n_past = ((int32_t *)(dst->op_params))[0];
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel;
+
+ if (ne00%8 == 0) {
+ kernel = backend_ctx->kernel_diag_mask_inf_8;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
+
+ size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ } else {
+ kernel = backend_ctx->kernel_diag_mask_inf;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
+
+ size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
+ size_t local_work_size[] = {64, 1, 1};
+
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ }
+ }
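Both kernel variants above (the 8-wide path when ne00 is a multiple of 8, and the scalar fallback) implement the same GGML_OP_DIAG_MASK_INF semantics: row i of each attention matrix may only see columns up to n_past + i, and everything beyond is set to -infinity. A minimal scalar reference of that behaviour, for orientation only and not the backend's actual kernel source (needs <math.h> for INFINITY):

    // Illustrative reference: x points to ne00*ne01*ne02 contiguous floats.
    static void diag_mask_inf_ref(float * x, int ne00, int ne01, int ne02, int n_past) {
        for (int i02 = 0; i02 < ne02; ++i02) {
            for (int i01 = 0; i01 < ne01; ++i01) {
                float * row = x + (size_t)i02*ne01*ne00 + (size_t)i01*ne00;
                // columns j > n_past + i01 are "future" positions and get masked
                for (int i00 = n_past + i01 + 1; i00 < ne00; ++i00) {
                    row[i00] = -INFINITY;
                }
            }
        }
    }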
+
+ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ // Softmax can now fuse KQ mask and KQ scale, which used to be two additional
+ // ops before softmax. It now also fuses alibi if `max_bias > 0`. For llama,
+ // alibi is not used; however, for some other models, it is used.
+ // KQ_mask
+ if (src1) {
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ }
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ float scale, max_bias;
+ memcpy(&scale, dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, dst->op_params + 1, sizeof(float));
+
+ const int nrows_x = ggml_nrows(src0);
+ const int nrows_y = src0->ne[1];
+
+ const int n_head = nrows_x/nrows_y;
+ const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+ // Local size must be wave size. Each workgroup is a wave, working on a row,
+ // where a row corresponds to leading dimension.
+ int nth = MIN(32, ne00);
+
+ if (backend_ctx->gpu_family == INTEL) {
+ // This is the same as the initial value.
+ nth = MIN(32, ne00);
+ }
+ else if (backend_ctx->gpu_family == ADRENO) {
+ nth = 64;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ cl_kernel kernel;
+
+ if (ne00%4 == 0) {
+ kernel = backend_ctx->kernel_soft_max_4;
+ } else {
+ kernel = backend_ctx->kernel_soft_max;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ }
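To make the fusion described in the comment above concrete: per row, the logits are scaled, an optional KQ mask is added with a per-head ALiBi slope derived from m0, m1 and n_head_log2, and the result is normalised. The following scalar sketch mirrors the ggml soft_max_ext semantics as a reference; it is illustrative only, not the OpenCL kernel source:

    // Illustrative per-row reference of the fused soft_max (scale + mask/ALiBi + softmax).
    static void soft_max_row_ref(const float * src, const float * mask, float * dst,
                                 int ne00, int head, float scale, float max_bias,
                                 float m0, float m1, int n_head_log2) {
        // Per-head ALiBi slope, consistent with the m0/m1/n_head_log2 computed above.
        float slope = 1.0f;
        if (max_bias > 0.0f) {
            slope = head < n_head_log2 ? powf(m0, head + 1)
                                       : powf(m1, 2*(head - n_head_log2) + 1);
        }
        float maxv = -INFINITY;
        for (int i = 0; i < ne00; ++i) {
            dst[i] = src[i]*scale + (mask ? slope*mask[i] : 0.0f);
            maxv   = fmaxf(maxv, dst[i]);
        }
        float sum = 0.0f;
        for (int i = 0; i < ne00; ++i) {
            dst[i] = expf(dst[i] - maxv);
            sum   += dst[i];
        }
        for (int i = 0; i < ne00; ++i) {
            dst[i] /= sum;
        }
    }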
+
+ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ cl_command_queue queue = backend_ctx->queue;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ ggml_tensor * src2 = dst->src[2];
+ ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
+
+ cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
+
+ const int ne00 = src0 ? src0->ne[0] : 0;
+ const int ne01 = src0 ? src0->ne[1] : 0;
+ const int ne02 = src0 ? src0->ne[2] : 0;
+ const int ne03 = src0 ? src0->ne[3] : 0;
+
+ const int nb00 = src0 ? src0->nb[0] : 0;
+ const int nb01 = src0 ? src0->nb[1] : 0;
+ const int nb02 = src0 ? src0->nb[2] : 0;
+ const int nb03 = src0 ? src0->nb[3] : 0;
+
+ const int ne10 = src1 ? src1->ne[0] : 0;
+ const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
+ const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12);
+ const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+ const int ne0 = dst ? dst->ne[0] : 0;
+ const int ne1 = dst ? dst->ne[1] : 0;
+ const int ne2 = dst ? dst->ne[2] : 0;
+ const int ne3 = dst ? dst->ne[3] : 0;
+
+ const int nb0 = dst ? dst->nb[0] : 0;
+ const int nb1 = dst ? dst->nb[1] : 0;
+ const int nb2 = dst ? dst->nb[2] : 0;
+ const int nb3 = dst ? dst->nb[3] : 0;
+
+ GGML_ASSERT(ne10 == ne02);
+
+ int nth = MIN(64, ne00);
+
+ const int n_past = ((int *) dst->op_params)[0];
+ const int n_dims = ((int *) dst->op_params)[1];
+ const int mode = ((int *) dst->op_params)[2];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+ float freq_base;
+ float freq_scale;
+ float ext_factor;
+ float attn_factor;
+ float beta_fast;
+ float beta_slow;
+
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+
+ const bool is_neox = mode & 2;
+
+ cl_kernel kernel;
+
+ if (!is_neox) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_rope_norm_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_rope_norm_f16;
+ break;
+ default:
+ GGML_ASSERT(false);
+ };
+ } else {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_rope_neox_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_rope_neox_f16;
+ break;
+ default:
+ GGML_ASSERT(false);
+ };
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past));
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims));
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig));
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &freq_base));
+ CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float), &freq_scale));
+ CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &ext_factor));
+ CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
+ CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
+ CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ #ifdef GGML_OPENCL_PROFILING
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+ g_profiling_info.emplace_back();
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ #else
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ #endif
+ }
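The `mode & 2` check above selects between the two RoPE layouts: the "norm" kernels rotate adjacent element pairs (i0, i0+1), while the NeoX kernels rotate elements half the rotated width apart (i0, i0 + n_dims/2). A hedged scalar sketch of that pairing for a single pair, given a precomputed rotation angle; illustrative only, not the kernel source:

    // Illustrative: which two elements of a row each RoPE variant rotates together.
    static void rope_rotate_pair(float * row, int i0, int n_dims, bool is_neox,
                                 float cos_theta, float sin_theta) {
        // norm variant pairs (i0, i0+1); NeoX pairs (i0, i0 + n_dims/2)
        const int i1 = is_neox ? i0 + n_dims/2 : i0 + 1;
        const float x0 = row[i0];
        const float x1 = row[i1];
        row[i0] = x0*cos_theta - x1*sin_theta;
        row[i1] = x0*sin_theta + x1*cos_theta;
    }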
+
+ //------------------------------------------------------------------------------
+ // Op offloading
+ //------------------------------------------------------------------------------
+
+ typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+ ggml_cl_func_t func = nullptr;
+
+ ggml_tensor * src0 = tensor->src[0];
+ ggml_tensor * src1 = tensor->src[1];
+
+ const bool any_on_device = tensor->extra
+ || (src0 != nullptr && src0->extra)
+ || (src1 != nullptr && src1->extra);
+
+ switch (tensor->op) {
+ case GGML_OP_GET_ROWS:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_get_rows;
+ break;
+ case GGML_OP_CPY:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_cpy;
+ break;
+ case GGML_OP_DUP:
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_dup;
+ break;
+ case GGML_OP_ADD:
+ if (!any_on_device) {
+ return false;
+ }
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));
+ func = ggml_cl_add;
+ break;
+ case GGML_OP_MUL:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_mul;
+ break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_silu;
+ break;
+ case GGML_UNARY_OP_RELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_relu;
+ break;
+ default:
+ return false;
+ } break;
+ case GGML_OP_CLAMP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_clamp;
+ break;
+ case GGML_OP_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_norm;
+ break;
+ case GGML_OP_RMS_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_rms_norm;
+ break;
+ case GGML_OP_MUL_MAT:
+ if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+ return false;
+ }
+ func = ggml_cl_mul_mat;
+ break;
+ case GGML_OP_SCALE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_scale;
+ break;
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_nop;
+ break;
+ case GGML_OP_DIAG_MASK_INF:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_diag_mask_inf;
+ break;
+ case GGML_OP_SOFT_MAX:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_soft_max;
+ break;
+ case GGML_OP_ROPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_rope;
+ break;
+ default:
+ return false;
+ }
+
+ func(backend, tensor->src[0], tensor->src[1], tensor);
+ return true;
+ }
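ggml_cl_compute_forward is the dispatch entry point added here: it maps a graph node's op to one of the ggml_cl_* handlers above and returns false when the op is unsupported or none of the operands are resident on the device. A minimal hypothetical caller, shown only to illustrate how a compute loop might use the return value (the surrounding function and names are not from this diff):

    // Hypothetical sketch: try to offload each node, treating false as "not handled".
    static void compute_nodes_sketch(ggml_backend_t backend, struct ggml_tensor ** nodes, int n_nodes) {
        for (int i = 0; i < n_nodes; ++i) {
            if (!ggml_cl_compute_forward(backend, nodes[i])) {
                // node was not offloaded; a real caller would fall back to another backend here
            }
        }
    }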