@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -64,11 +64,33 @@ enum ADRENO_GPU_GEN {
64
64
  X1E,
65
65
  };
66
66
 
67
+ enum ADRENO_CL_COMPILER_TYPE {
68
+ E031,
69
+ DX,
70
+ };
71
+
67
72
  struct ggml_cl_version {
68
73
  cl_uint major = 0;
69
74
  cl_uint minor = 0;
70
75
  };
71
76
 
77
+ struct ggml_cl_compiler_version {
78
+ ADRENO_CL_COMPILER_TYPE type;
79
+ int major = -1;
80
+ int minor = -1;
81
+ int patch = -1;
82
+
83
+ bool same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
84
+ return major == x && minor == y && patch == z && type == t;
85
+ }
86
+ bool newer_than(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
87
+ return major*10000 + minor*100 + patch > x*10000 + y*100 + z && type == t;
88
+ }
89
+ bool newer_than_or_same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
90
+ return same(t, x, y, z) || newer_than(t, x, y, z);
91
+ }
92
+ };
93
+
72
94
  // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
73
95
  static ggml_cl_version parse_cl_version(std::string_view str) {
74
96
  size_t major_str_begin = 0;
@@ -173,24 +195,30 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
173
195
  return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
174
196
  }
175
197
 
176
- static int get_adreno_cl_compiler_version(const char *driver_version) {
198
+ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *driver_version) {
177
199
  std::string driver_ver_str(driver_version);
200
+ ADRENO_CL_COMPILER_TYPE type = ADRENO_CL_COMPILER_TYPE::E031;
178
201
  size_t compiler_ver_pos = driver_ver_str.find("E031");
179
202
  size_t compiler_ver_len = 13;
180
- size_t compiler_ver_offset = 5;
203
+ size_t compiler_major_offset = 5;
204
+ size_t compiler_minor_offset = 8;
205
+ size_t compiler_patch_offset = 11;
181
206
 
182
207
  if (compiler_ver_pos == std::string::npos) {
183
208
  compiler_ver_pos = driver_ver_str.find("DX");
184
209
  if (compiler_ver_pos == std::string::npos) {
185
- return -1;
210
+ return {};
186
211
  }
212
+ type = ADRENO_CL_COMPILER_TYPE::DX;
187
213
  compiler_ver_len = 11;
188
- compiler_ver_offset = 3;
214
+ compiler_major_offset = 3;
189
215
  }
190
216
 
191
217
  std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
192
- std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2);
193
- return std::atoi(major_ver_str.c_str());
218
+ int major = std::atoi(compiler_ver_str.substr(compiler_major_offset, 2).c_str());
219
+ int minor = std::atoi(compiler_ver_str.substr(compiler_minor_offset, 2).c_str());
220
+ int patch = std::atoi(compiler_ver_str.substr(compiler_patch_offset, 2).c_str());
221
+ return { type, major, minor, patch };
194
222
  }
195
223
 
196
224
  // backend device context
@@ -215,21 +243,55 @@ struct ggml_backend_opencl_context {
215
243
  cl_int alignment;
216
244
  size_t max_alloc_size;
217
245
  bool fp16_support;
246
+ bool has_vector_subgroup_broadcast;
247
+ ggml_cl_compiler_version adreno_cl_compiler_version;
218
248
 
219
249
  int adreno_wave_size;
220
250
 
221
251
  cl_context context;
222
252
  cl_command_queue queue;
223
253
 
224
- cl_program program;
225
- cl_program program_1;
226
- cl_program program_2;
254
+ cl_program program_add;
255
+ cl_program program_clamp;
256
+ cl_program program_cpy;
257
+ cl_program program_cvt;
258
+ cl_program program_diag_mask_inf;
259
+ cl_program program_gelu;
260
+ cl_program program_gemv_noshuffle_general;
261
+ cl_program program_gemv_noshuffle;
262
+ cl_program program_get_rows;
263
+ cl_program program_im2col_f16;
264
+ cl_program program_im2col_f32;
265
+ cl_program program_mul_mat_Ab_Bi_8x4;
266
+ cl_program program_mul_mv_q4_0_f32;
267
+ cl_program program_mul_mv_q4_0_f32_v;
268
+ cl_program program_mul_mv_q4_0_f32_8x_flat;
269
+ cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
270
+ cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
271
+ cl_program program_mul_mv_q6_K;
272
+ cl_program program_mul_mv_f16_f16;
273
+ cl_program program_mul_mv_f16_f32_1row;
274
+ cl_program program_mul_mv_f16_f32_l4;
275
+ cl_program program_mul_mv_f16_f32;
276
+ cl_program program_mul_mv_f32_f32;
277
+ cl_program program_mul;
278
+ cl_program program_norm;
279
+ cl_program program_relu;
280
+ cl_program program_rms_norm;
281
+ cl_program program_rope;
282
+ cl_program program_scale;
283
+ cl_program program_silu;
284
+ cl_program program_softmax_f32;
285
+ cl_program program_softmax_f16;
286
+ cl_program program_softmax_4_f32;
287
+ cl_program program_softmax_4_f16;
227
288
 
228
289
  cl_kernel kernel_add, kernel_add_row;
229
290
  cl_kernel kernel_mul, kernel_mul_row;
230
291
  cl_kernel kernel_scale;
231
292
  cl_kernel kernel_silu, kernel_silu_4;
232
293
  cl_kernel kernel_gelu, kernel_gelu_4;
294
+ cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
233
295
  cl_kernel kernel_relu;
234
296
  cl_kernel kernel_clamp;
235
297
  cl_kernel kernel_norm;
@@ -239,6 +301,7 @@ struct ggml_backend_opencl_context {
239
301
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
240
302
  cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
241
303
  cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
304
+ cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
242
305
  cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
243
306
  cl_kernel kernel_mul_mat_f32_f32;
244
307
  cl_kernel kernel_mul_mat_f16_f16;
@@ -246,18 +309,17 @@ struct ggml_backend_opencl_context {
246
309
  cl_kernel kernel_mul_mat_f16_f32;
247
310
  cl_kernel kernel_mul_mat_f16_f32_l4;
248
311
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
249
- cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
312
+ cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
250
313
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
251
- cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
252
- kernel_mul_mat_q4_0_f32_flat_img_v0;
314
+ cl_kernel kernel_convert_block_q4_0_noshuffle;
253
315
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
254
316
  cl_kernel kernel_mul_mv_q6_K_f32;
317
+ cl_kernel kernel_im2col_f32, kernel_im2col_f16;
255
318
 
256
319
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
257
320
  // Transpose kernels
258
- cl_program program_transpose_32;
259
- cl_program program_transpose_32_16;
260
- cl_program program_transpose_16;
321
+ cl_program program_transpose;
322
+
261
323
  cl_kernel kernel_transpose_32;
262
324
  cl_kernel kernel_transpose_32_16;
263
325
  cl_kernel kernel_transpose_16;
@@ -370,6 +432,681 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
370
432
  return p;
371
433
  }
372
434
 
435
+ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
436
+ cl_int err;
437
+
438
+ // compiler options for general kernels
439
+ auto opencl_c_std =
440
+ std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
441
+ std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
442
+ " -cl-mad-enable -cl-unsafe-math-optimizations"
443
+ " -cl-finite-math-only -cl-fast-relaxed-math";
444
+
445
+ GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
446
+
447
+ // add
448
+ {
449
+ #ifdef GGML_OPENCL_EMBED_KERNELS
450
+ const std::string kernel_src {
451
+ #include "add.cl.h"
452
+ };
453
+ #else
454
+ const std::string kernel_src = read_file("add.cl");
455
+ #endif
456
+ backend_ctx->program_add =
457
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
458
+
459
+ CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
460
+ CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
461
+ GGML_LOG_CONT(".");
462
+ }
463
+
464
+ // clamp
465
+ {
466
+ #ifdef GGML_OPENCL_EMBED_KERNELS
467
+ const std::string kernel_src {
468
+ #include "clamp.cl.h"
469
+ };
470
+ #else
471
+ const std::string kernel_src = read_file("clamp.cl");
472
+ #endif
473
+ backend_ctx->program_clamp =
474
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
475
+
476
+ CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program_clamp, "kernel_clamp", &err), err));
477
+ GGML_LOG_CONT(".");
478
+ }
479
+
480
+ // cpy
481
+ {
482
+ #ifdef GGML_OPENCL_EMBED_KERNELS
483
+ const std::string kernel_src {
484
+ #include "cpy.cl.h"
485
+ };
486
+ #else
487
+ const std::string kernel_src = read_file("cpy.cl");
488
+ #endif
489
+ backend_ctx->program_cpy =
490
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
491
+
492
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
493
+ CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
494
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
495
+ CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
496
+ GGML_LOG_CONT(".");
497
+ }
498
+
499
+ // cvt
500
+ {
501
+ #ifdef GGML_OPENCL_EMBED_KERNELS
502
+ const std::string kernel_src {
503
+ #include "cvt.cl.h"
504
+ };
505
+ #else
506
+ const std::string kernel_src = read_file("cvt.cl");
507
+ #endif
508
+ backend_ctx->program_cvt =
509
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
510
+
511
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
512
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
513
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
514
+ GGML_LOG_CONT(".");
515
+ }
516
+
517
+ // diag_mask_inf
518
+ {
519
+ #ifdef GGML_OPENCL_EMBED_KERNELS
520
+ const std::string kernel_src {
521
+ #include "diag_mask_inf.cl.h"
522
+ };
523
+ #else
524
+ const std::string kernel_src = read_file("diag_mask_inf.cl");
525
+ #endif
526
+ backend_ctx->program_diag_mask_inf =
527
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
528
+
529
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf_8", &err), err));
530
+ CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf", &err), err));
531
+ GGML_LOG_CONT(".");
532
+ }
533
+
534
+ // gelu
535
+ {
536
+ #ifdef GGML_OPENCL_EMBED_KERNELS
537
+ const std::string kernel_src {
538
+ #include "gelu.cl.h"
539
+ };
540
+ #else
541
+ const std::string kernel_src = read_file("gelu.cl");
542
+ #endif
543
+ backend_ctx->program_gelu =
544
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
545
+
546
+ CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
547
+ CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
548
+ CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
549
+ CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
550
+ GGML_LOG_CONT(".");
551
+ }
552
+
553
+ // get_rows
554
+ {
555
+ #ifdef GGML_OPENCL_EMBED_KERNELS
556
+ const std::string kernel_src {
557
+ #include "get_rows.cl.h"
558
+ };
559
+ #else
560
+ const std::string kernel_src = read_file("get_rows.cl");
561
+ #endif
562
+ backend_ctx->program_get_rows =
563
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
564
+
565
+ CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f32", &err), err));
566
+ CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f16", &err), err));
567
+ CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_q4_0", &err), err));
568
+ GGML_LOG_CONT(".");
569
+ }
570
+
571
+ // im2col_f32
572
+ {
573
+ #ifdef GGML_OPENCL_EMBED_KERNELS
574
+ const std::string kernel_src {
575
+ #include "im2col_f32.cl.h"
576
+ };
577
+ #else
578
+ const std::string kernel_src = read_file("im2col_f32.cl");
579
+ #endif
580
+ backend_ctx->program_im2col_f32 =
581
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
582
+
583
+ CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
584
+ GGML_LOG_CONT(".");
585
+ }
586
+
587
+ // im2col_f16
588
+ {
589
+ #ifdef GGML_OPENCL_EMBED_KERNELS
590
+ const std::string kernel_src {
591
+ #include "im2col_f16.cl.h"
592
+ };
593
+ #else
594
+ const std::string kernel_src = read_file("im2col_f16.cl");
595
+ #endif
596
+ backend_ctx->program_im2col_f16 =
597
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
598
+
599
+ CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
600
+ GGML_LOG_CONT(".");
601
+ }
602
+
603
+ // mul_mv_q4_0_f32
604
+ {
605
+ #ifdef GGML_OPENCL_EMBED_KERNELS
606
+ const std::string kernel_src {
607
+ #include "mul_mv_q4_0_f32.cl.h"
608
+ };
609
+ #else
610
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
611
+ #endif
612
+ backend_ctx->program_mul_mv_q4_0_f32 =
613
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
614
+
615
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
616
+ GGML_LOG_CONT(".");
617
+ }
618
+
619
+ // mul_mv_q4_0_f32_v
620
+ {
621
+ #ifdef GGML_OPENCL_EMBED_KERNELS
622
+ const std::string kernel_src {
623
+ #include "mul_mv_q4_0_f32_v.cl.h"
624
+ };
625
+ #else
626
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
627
+ #endif
628
+ backend_ctx->program_mul_mv_q4_0_f32_v =
629
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
630
+
631
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
632
+ GGML_LOG_CONT(".");
633
+ }
634
+
635
+ // mul_mv_q4_0_f32_8x_flat
636
+ {
637
+ #ifdef GGML_OPENCL_EMBED_KERNELS
638
+ const std::string kernel_src {
639
+ #include "mul_mv_q4_0_f32_8x_flat.cl.h"
640
+ };
641
+ #else
642
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
643
+ #endif
644
+ backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
645
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
646
+
647
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
648
+ GGML_LOG_CONT(".");
649
+ }
650
+
651
+ // mul_mv_q4_0_f32_1d_8x_flat
652
+ // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
653
+ // those compiler versions since it is anyway not used for Adreno.
654
+ if (backend_ctx->gpu_family != ADRENO ||
655
+ backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
656
+ backend_ctx->adreno_cl_compiler_version.type == DX) {
657
+ #ifdef GGML_OPENCL_EMBED_KERNELS
658
+ const std::string kernel_src {
659
+ #include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
660
+ };
661
+ #else
662
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
663
+ #endif
664
+ backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
665
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
666
+
667
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
668
+ GGML_LOG_CONT(".");
669
+ }
670
+
671
+ // mul_mv_q4_0_f32_1d_16x_flat
672
+ // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
673
+ // those compiler versions since it is anyway not used for Adreno.
674
+ if (backend_ctx->gpu_family != ADRENO ||
675
+ backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
676
+ backend_ctx->adreno_cl_compiler_version.type == DX) {
677
+ #ifdef GGML_OPENCL_EMBED_KERNELS
678
+ const std::string kernel_src {
679
+ #include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
680
+ };
681
+ #else
682
+ const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
683
+ #endif
684
+ backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
685
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
686
+
687
+ CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
688
+ GGML_LOG_CONT(".");
689
+ }
690
+
691
+ // mul_mv_q6_k
692
+ {
693
+ #ifdef GGML_OPENCL_EMBED_KERNELS
694
+ const std::string kernel_src {
695
+ #include "mul_mv_q6_k.cl.h"
696
+ };
697
+ #else
698
+ const std::string kernel_src = read_file("mul_mv_q6_k.cl");
699
+ #endif
700
+ backend_ctx->program_mul_mv_q6_K =
701
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
702
+
703
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_mul_mv_q6_K, "kernel_mul_mv_q6_K_f32", &err), err));
704
+ GGML_LOG_CONT(".");
705
+ }
706
+
707
+ // mul_mv_f16_f16
708
+ {
709
+ #ifdef GGML_OPENCL_EMBED_KERNELS
710
+ const std::string kernel_src {
711
+ #include "mul_mv_f16_f16.cl.h"
712
+ };
713
+ #else
714
+ const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
715
+ #endif
716
+ backend_ctx->program_mul_mv_f16_f16 =
717
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
718
+
719
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
720
+ GGML_LOG_CONT(".");
721
+ }
722
+
723
+ // mul_mv_f16_f32_1row
724
+ {
725
+ #ifdef GGML_OPENCL_EMBED_KERNELS
726
+ const std::string kernel_src {
727
+ #include "mul_mv_f16_f32_1row.cl.h"
728
+ };
729
+ #else
730
+ const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
731
+ #endif
732
+ backend_ctx->program_mul_mv_f16_f32_1row =
733
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
734
+
735
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
736
+ GGML_LOG_CONT(".");
737
+ }
738
+
739
+ // mul_mv_f16_f32_l4
740
+ {
741
+ #ifdef GGML_OPENCL_EMBED_KERNELS
742
+ const std::string kernel_src {
743
+ #include "mul_mv_f16_f32_l4.cl.h"
744
+ };
745
+ #else
746
+ const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
747
+ #endif
748
+ backend_ctx->program_mul_mv_f16_f32_l4 =
749
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
750
+
751
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
752
+ GGML_LOG_CONT(".");
753
+ }
754
+
755
+ // mul_mv_f16_f32
756
+ {
757
+ #ifdef GGML_OPENCL_EMBED_KERNELS
758
+ const std::string kernel_src {
759
+ #include "mul_mv_f16_f32.cl.h"
760
+ };
761
+ #else
762
+ const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
763
+ #endif
764
+ backend_ctx->program_mul_mv_f16_f32 =
765
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
766
+
767
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
768
+ GGML_LOG_CONT(".");
769
+ }
770
+
771
+ // mul_mv_f32_f32
772
+ {
773
+ #ifdef GGML_OPENCL_EMBED_KERNELS
774
+ const std::string kernel_src {
775
+ #include "mul_mv_f32_f32.cl.h"
776
+ };
777
+ #else
778
+ const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
779
+ #endif
780
+ backend_ctx->program_mul_mv_f32_f32 =
781
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
782
+
783
+ CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
784
+ GGML_LOG_CONT(".");
785
+ }
786
+
787
+ // mul
788
+ {
789
+ #ifdef GGML_OPENCL_EMBED_KERNELS
790
+ const std::string kernel_src {
791
+ #include "mul.cl.h"
792
+ };
793
+ #else
794
+ const std::string kernel_src = read_file("mul.cl");
795
+ #endif
796
+ backend_ctx->program_mul =
797
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
798
+
799
+ CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
800
+ CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
801
+ GGML_LOG_CONT(".");
802
+ }
803
+
804
+ // norm
805
+ {
806
+ #ifdef GGML_OPENCL_EMBED_KERNELS
807
+ const std::string kernel_src {
808
+ #include "norm.cl.h"
809
+ };
810
+ #else
811
+ const std::string kernel_src = read_file("norm.cl");
812
+ #endif
813
+ backend_ctx->program_norm =
814
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
815
+
816
+ CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program_norm, "kernel_norm", &err), err));
817
+ GGML_LOG_CONT(".");
818
+ }
819
+
820
+ // relu
821
+ {
822
+ #ifdef GGML_OPENCL_EMBED_KERNELS
823
+ const std::string kernel_src {
824
+ #include "relu.cl.h"
825
+ };
826
+ #else
827
+ const std::string kernel_src = read_file("relu.cl");
828
+ #endif
829
+ backend_ctx->program_relu =
830
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
831
+
832
+ CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program_relu, "kernel_relu", &err), err));
833
+ GGML_LOG_CONT(".");
834
+ }
835
+
836
+ // rms_norm
837
+ {
838
+ #ifdef GGML_OPENCL_EMBED_KERNELS
839
+ const std::string kernel_src {
840
+ #include "rms_norm.cl.h"
841
+ };
842
+ #else
843
+ const std::string kernel_src = read_file("rms_norm.cl");
844
+ #endif
845
+ backend_ctx->program_rms_norm =
846
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
847
+
848
+ CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
849
+ GGML_LOG_CONT(".");
850
+ }
851
+
852
+ // rope
853
+ {
854
+ #ifdef GGML_OPENCL_EMBED_KERNELS
855
+ const std::string kernel_src {
856
+ #include "rope.cl.h"
857
+ };
858
+ #else
859
+ const std::string kernel_src = read_file("rope.cl");
860
+ #endif
861
+ backend_ctx->program_rope =
862
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
863
+
864
+ CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f32", &err), err));
865
+ CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f16", &err), err));
866
+ CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f32", &err), err));
867
+ CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f16", &err), err));
868
+ CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f32", &err), err));
869
+ CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f16", &err), err));
870
+ CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f32", &err), err));
871
+ CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f16", &err), err));
872
+ GGML_LOG_CONT(".");
873
+ }
874
+
875
+ // scale
876
+ {
877
+ #ifdef GGML_OPENCL_EMBED_KERNELS
878
+ const std::string kernel_src {
879
+ #include "scale.cl.h"
880
+ };
881
+ #else
882
+ const std::string kernel_src = read_file("scale.cl");
883
+ #endif
884
+ backend_ctx->program_scale =
885
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
886
+
887
+ CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
888
+ GGML_LOG_CONT(".");
889
+ }
890
+
891
+ // silu
892
+ {
893
+ #ifdef GGML_OPENCL_EMBED_KERNELS
894
+ const std::string kernel_src {
895
+ #include "silu.cl.h"
896
+ };
897
+ #else
898
+ const std::string kernel_src = read_file("silu.cl");
899
+ #endif
900
+ backend_ctx->program_silu =
901
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
902
+
903
+ CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program_silu, "kernel_silu", &err), err));
904
+ CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program_silu, "kernel_silu_4", &err), err));
905
+ GGML_LOG_CONT(".");
906
+ }
907
+
908
+ // softmax_f32
909
+ {
910
+ #ifdef GGML_OPENCL_EMBED_KERNELS
911
+ const std::string kernel_src {
912
+ #include "softmax_f32.cl.h"
913
+ };
914
+ #else
915
+ const std::string kernel_src = read_file("softmax_f32.cl");
916
+ #endif
917
+ backend_ctx->program_softmax_f32 =
918
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
919
+
920
+ CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
921
+ GGML_LOG_CONT(".");
922
+ }
923
+
924
+ // softmax_f16
925
+ {
926
+ #ifdef GGML_OPENCL_EMBED_KERNELS
927
+ const std::string kernel_src {
928
+ #include "softmax_f16.cl.h"
929
+ };
930
+ #else
931
+ const std::string kernel_src = read_file("softmax_f16.cl");
932
+ #endif
933
+ backend_ctx->program_softmax_f16 =
934
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
935
+
936
+ CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
937
+ GGML_LOG_CONT(".");
938
+ }
939
+
940
+ // softmax_4_f32
941
+ {
942
+ #ifdef GGML_OPENCL_EMBED_KERNELS
943
+ const std::string kernel_src {
944
+ #include "softmax_4_f32.cl.h"
945
+ };
946
+ #else
947
+ const std::string kernel_src = read_file("softmax_4_f32.cl");
948
+ #endif
949
+ backend_ctx->program_softmax_4_f32 =
950
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
951
+
952
+ CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
953
+ GGML_LOG_CONT(".");
954
+ }
955
+
956
+ // softmax_4_f16
957
+ {
958
+ #ifdef GGML_OPENCL_EMBED_KERNELS
959
+ const std::string kernel_src {
960
+ #include "softmax_4_f16.cl.h"
961
+ };
962
+ #else
963
+ const std::string kernel_src = read_file("softmax_4_f16.cl");
964
+ #endif
965
+ backend_ctx->program_softmax_4_f16 =
966
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
967
+
968
+ CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
969
+ GGML_LOG_CONT(".");
970
+ }
971
+
972
+ // Adreno kernels
973
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
974
+ // transpose
975
+ {
976
+ #ifdef GGML_OPENCL_EMBED_KERNELS
977
+ const std::string kernel_src {
978
+ #include "transpose.cl.h"
979
+ };
980
+ #else
981
+ const std::string kernel_src = read_file("transpose.cl");
982
+ #endif
983
+ backend_ctx->program_transpose =
984
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
985
+
986
+ CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
987
+ CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
988
+ CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
989
+ GGML_LOG_CONT(".");
990
+ }
991
+
992
+ // gemv_noshuffle_general
993
+ {
994
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
995
+ " -cl-mad-enable "
996
+ " -DSIMDGROUP_WIDTH=" +
997
+ std::to_string(backend_ctx->adreno_wave_size);
998
+ if (backend_ctx->has_vector_subgroup_broadcast) {
999
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1000
+ }
1001
+
1002
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1003
+ const std::string kernel_src_CL_gemv_general {
1004
+ #include "gemv_noshuffle_general.cl.h"
1005
+ };
1006
+ #else
1007
+ const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
1008
+ #endif
1009
+
1010
+ backend_ctx->program_CL_gemv_general = build_program_from_source(
1011
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
1012
+
1013
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
1014
+ GGML_LOG_CONT(".");
1015
+ }
1016
+
1017
+ // gemv_noshuffle
1018
+ {
1019
+ // Gemv 2048, 16384
1020
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1021
+ " -cl-mad-enable "
1022
+ " -DLINE_STRIDE_A=2048 "
1023
+ " -DBLOCK_STRIDE_A=16384 "
1024
+ " -DSIMDGROUP_WIDTH=" +
1025
+ std::to_string(backend_ctx->adreno_wave_size);
1026
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1027
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1028
+ }
1029
+
1030
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1031
+ const std::string kernel_src_CL_gemv {
1032
+ #include "gemv_noshuffle.cl.h"
1033
+ };
1034
+ #else
1035
+ const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
1036
+ #endif
1037
+
1038
+ backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
1039
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1040
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
1041
+ GGML_LOG_CONT(".");
1042
+
1043
+ // Gemv 2048, 16384
1044
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1045
+ " -cl-mad-enable "
1046
+ " -DLINE_STRIDE_A=2048 "
1047
+ " -DBLOCK_STRIDE_A=16384 "
1048
+ " -DSIMDGROUP_WIDTH=" +
1049
+ std::to_string(backend_ctx->adreno_wave_size);
1050
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1051
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1052
+ }
1053
+
1054
+ backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
1055
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1056
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
1057
+ GGML_LOG_CONT(".");
1058
+
1059
+ // Gemv 5504, 44032
1060
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1061
+ " -cl-mad-enable "
1062
+ " -DLINE_STRIDE_A=5504 "
1063
+ " -DBLOCK_STRIDE_A=44032 "
1064
+ " -DSIMDGROUP_WIDTH=" +
1065
+ std::to_string(backend_ctx->adreno_wave_size);
1066
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1067
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1068
+ }
1069
+
1070
+ backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
1071
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1072
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
1073
+ GGML_LOG_CONT(".");
1074
+
1075
+ // Gemv 16000, 128000
1076
+ CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
1077
+ " -cl-mad-enable "
1078
+ " -DLINE_STRIDE_A=16000 "
1079
+ " -DBLOCK_STRIDE_A=128000 "
1080
+ " -DSIMDGROUP_WIDTH=" +
1081
+ std::to_string(backend_ctx->adreno_wave_size);
1082
+
1083
+ if (backend_ctx->has_vector_subgroup_broadcast) {
1084
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1085
+ }
1086
+
1087
+ backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
1088
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
1089
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
1090
+ GGML_LOG_CONT(".");
1091
+ }
1092
+
1093
+ // mul_mat_Ab_Bi_8x4
1094
+ {
1095
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1096
+ const std::string kernel_src_CL_gemm {
1097
+ #include "mul_mat_Ab_Bi_8x4.cl.h"
1098
+ };
1099
+ #else
1100
+ const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
1101
+ #endif
1102
+ backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
1103
+ CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
1104
+ GGML_LOG_CONT(".");
1105
+ }
1106
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1107
+ GGML_LOG_CONT("\n");
1108
+ }
1109
+
373
1110
  static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
374
1111
  static bool initialized = false;
375
1112
  static ggml_backend_opencl_context *backend_ctx = nullptr;
@@ -411,6 +1148,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
411
1148
  unsigned number;
412
1149
  cl_device_type type;
413
1150
  char name[128];
1151
+ char version[128];
414
1152
  };
415
1153
 
416
1154
  enum { NPLAT = 16, NDEV = 16 };
@@ -451,6 +1189,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
451
1189
  d->platform = p;
452
1190
  CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
453
1191
  CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
1192
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));
454
1193
 
455
1194
  if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
456
1195
  p->default_device = d;
@@ -543,7 +1282,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
543
1282
  }
544
1283
 
545
1284
  GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
546
- GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
1285
+ GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
547
1286
  if (default_device->type != CL_DEVICE_TYPE_GPU) {
548
1287
  GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
549
1288
  }
@@ -552,9 +1291,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
552
1291
  dev_ctx->device = default_device->id;
553
1292
  backend_ctx->device = default_device->id;
554
1293
 
555
- if (strstr(default_device->name, "Adreno")) {
1294
+ if (strstr(default_device->name, "Adreno") ||
1295
+ strstr(default_device->name, "Qualcomm") ||
1296
+ strstr(default_device->version, "Adreno")) {
556
1297
  backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
557
- backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
1298
+ // Usually device version contains the detailed device name
1299
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
1300
+ if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
1301
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
1302
+ }
558
1303
 
559
1304
  // Use wave size of 64 for all Adreno GPUs.
560
1305
  backend_ctx->adreno_wave_size = 64;
@@ -600,11 +1345,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
600
1345
  GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
601
1346
  backend_ctx->driver_version = driver_version;
602
1347
 
603
- int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
604
- bool has_vector_subgroup_broadcast =
605
- adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
1348
+ backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
1349
+ backend_ctx->has_vector_subgroup_broadcast =
1350
+ backend_ctx->adreno_cl_compiler_version.major >= 47 ||
1351
+ backend_ctx->adreno_cl_compiler_version.major == 17;
606
1352
  GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
607
- has_vector_subgroup_broadcast ? "true" : "false");
1353
+ backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
608
1354
 
609
1355
  size_t ext_str_size;
610
1356
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
@@ -679,230 +1425,32 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
679
1425
  #endif
680
1426
  CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
681
1427
 
682
- #ifdef GGML_OPENCL_EMBED_KERNELS
683
- const std::string kernel_src {
684
- #include "ggml-opencl.cl.h"
685
- };
686
- #else
687
- const std::string kernel_src = read_file("ggml-opencl.cl");
688
- #endif
689
-
690
- auto opencl_c_std =
691
- std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
692
-
693
- std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
694
- " -cl-mad-enable -cl-unsafe-math-optimizations"
695
- " -cl-finite-math-only -cl-fast-relaxed-math";
696
- backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
697
-
698
- // Non matmul kernels.
699
- CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
700
- CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
701
- CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
702
- CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
703
- CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
704
- CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
705
- CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
706
- CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
707
- CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
708
- CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
709
- CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
710
- CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
711
- CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
712
- CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
713
- CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
714
- CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
715
- CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
716
- CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
717
- CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
718
- CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
719
- CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
720
- CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
721
- CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
722
- CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
723
- CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
724
- CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
725
- CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
726
- CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
727
- CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
728
- CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
729
-
730
- // Matmul kernels.
731
- CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
732
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
733
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
734
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
735
- CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
736
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
737
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
738
-
739
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
740
- CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
741
- CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
742
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
743
-
744
- // Load additional mulmat kernels.
745
- #ifdef GGML_OPENCL_EMBED_KERNELS
746
- const std::string kernel_src_1 {
747
- #include "ggml-opencl_mm.cl.h"
748
- };
749
- #else
750
- const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
751
- #endif
752
- backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
753
-
754
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
755
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
756
- CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
757
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
758
- CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
1428
+ // Load kernels
1429
+ load_cl_kernels(backend_ctx, opencl_c_version);
759
1430
 
760
- // Load additional data conversion kernels.
761
- #ifdef GGML_OPENCL_EMBED_KERNELS
762
- const std::string kernel_src_2 {
763
- #include "ggml-opencl_cvt.cl.h"
764
- };
765
- #else
766
- const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
767
- #endif
768
- backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
769
-
770
- CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
771
-
772
- // Kernels for Adreno
773
1431
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
774
- #ifdef GGML_OPENCL_EMBED_KERNELS
775
- const std::string transpose_32_src {
776
- #include "ggml-opencl_transpose_32.cl.h"
777
- };
778
- #else
779
- const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
780
- #endif
781
- backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
782
- CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
783
-
784
- #ifdef GGML_OPENCL_EMBED_KERNELS
785
- const std::string transpose_32_16_src {
786
- #include "ggml-opencl_transpose_32_16.cl.h"
787
- };
788
- #else
789
- const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
790
- #endif
791
- backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
792
- CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
793
-
794
- #ifdef GGML_OPENCL_EMBED_KERNELS
795
- const std::string transpose_16_src {
796
- #include "ggml-opencl_transpose_16.cl.h"
797
- };
798
- #else
799
- const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
800
- #endif
801
- backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
802
- CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
803
-
804
- // Gemv general
805
- std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
806
- " -cl-mad-enable "
807
- " -DSIMDGROUP_WIDTH=" +
808
- std::to_string(backend_ctx->adreno_wave_size);
809
- if (has_vector_subgroup_broadcast) {
810
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
811
- }
812
- #ifdef GGML_OPENCL_EMBED_KERNELS
813
- const std::string kernel_src_CL_gemv_general {
814
- #include "ggml-opencl_gemv_noshuffle_general.cl.h"
815
- };
816
- #else
817
- const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
818
- #endif
819
-
820
- backend_ctx->program_CL_gemv_general = build_program_from_source(
821
- context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
822
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
823
-
824
- // Gemv 2048, 16384
825
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
826
- " -cl-mad-enable "
827
- " -DLINE_STRIDE_A=2048 "
828
- " -DBLOCK_STRIDE_A=16384 "
829
- " -DSIMDGROUP_WIDTH=" +
830
- std::to_string(backend_ctx->adreno_wave_size);
831
- if (has_vector_subgroup_broadcast) {
832
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1432
+ // Allocate intermediate buffers and images
1433
+ size_t required_A_q_d_bytes = 311164928;
1434
+ size_t required_A_s_d_bytes = 38895616;
1435
+ size_t required_B_d_bytes = 45088768;
1436
+
1437
+ // Ensure buffer sizes do not exceed the maximum allocation size
1438
+ size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
1439
+ size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
1440
+ size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
1441
+ if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
1442
+ GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
1443
+ required_A_q_d_bytes, max_A_q_d_bytes);
833
1444
  }
834
- #ifdef GGML_OPENCL_EMBED_KERNELS
835
- const std::string kernel_src_CL_gemv {
836
- #include "ggml-opencl_gemv_noshuffle.cl.h"
837
- };
838
- #else
839
- const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
840
- #endif
841
-
842
- backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
843
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
844
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
845
-
846
- // Gemv 2048, 16384
847
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
848
- " -cl-mad-enable "
849
- " -DLINE_STRIDE_A=2048 "
850
- " -DBLOCK_STRIDE_A=16384 "
851
- " -DSIMDGROUP_WIDTH=" +
852
- std::to_string(backend_ctx->adreno_wave_size);
853
- if (has_vector_subgroup_broadcast) {
854
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1445
+ if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
1446
+ GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
1447
+ required_A_s_d_bytes, max_A_s_d_bytes);
855
1448
  }
856
-
857
- backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
858
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
859
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
860
-
861
- // Gemv 5504, 44032
862
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
863
- " -cl-mad-enable "
864
- " -DLINE_STRIDE_A=5504 "
865
- " -DBLOCK_STRIDE_A=44032 "
866
- " -DSIMDGROUP_WIDTH=" +
867
- std::to_string(backend_ctx->adreno_wave_size);
868
- if (has_vector_subgroup_broadcast) {
869
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
1449
+ if (required_B_d_bytes > backend_ctx->max_alloc_size) {
1450
+ GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
1451
+ required_B_d_bytes, max_B_d_bytes);
870
1452
  }
871
1453
 
872
- backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
873
- context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
874
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
875
-
876
- // Gemv 16000, 128000
877
- CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
878
- " -cl-mad-enable "
879
- " -DLINE_STRIDE_A=16000 "
880
- " -DBLOCK_STRIDE_A=128000 "
881
- " -DSIMDGROUP_WIDTH=" +
882
- std::to_string(backend_ctx->adreno_wave_size);
883
- if (has_vector_subgroup_broadcast) {
884
- CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
885
- }
886
-
887
- backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
888
- CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
889
-
890
- // Gemm
891
- #ifdef GGML_OPENCL_EMBED_KERNELS
892
- const std::string kernel_src_CL_gemm {
893
- #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
894
- };
895
- #else
896
- const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
897
- #endif
898
- backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
899
- CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
900
-
901
- // Allocate intermediate buffers and images
902
- size_t max_A_q_d_bytes = 311164928;
903
- size_t max_A_s_d_bytes = 38895616;
904
- size_t max_B_d_bytes = 45088768;
905
-
906
1454
  CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
907
1455
  CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
908
1456
  CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
@@ -973,7 +1521,7 @@ static void ggml_cl2_free(void) {
973
1521
  info.cmd_complete_duration_ns/1.e6f,
974
1522
  info.cmd_total_duration_ns/1.e6f,
975
1523
  info.global_size[0], info.global_size[1], info.global_size[2],
976
- info.local_size[0], info.local_size[2], info.local_size[2],
1524
+ info.local_size[0], info.local_size[1], info.local_size[2],
977
1525
  info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
978
1526
  }
979
1527
  fclose(fperf);
@@ -1187,6 +1735,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1187
1735
  case GGML_UNARY_OP_GELU:
1188
1736
  case GGML_UNARY_OP_SILU:
1189
1737
  case GGML_UNARY_OP_RELU:
1738
+ case GGML_UNARY_OP_GELU_QUICK:
1190
1739
  return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1191
1740
  default:
1192
1741
  return false;
@@ -1216,14 +1765,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1216
1765
  return op->ne[3] == 1;
1217
1766
  case GGML_OP_ROPE: {
1218
1767
  const int mode = ((const int32_t *) op->op_params)[2];
1219
- if (mode & GGML_ROPE_TYPE_MROPE) {
1768
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
1769
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
1770
+ if (is_mrope && !is_vision) {
1771
+ if (op->src[0]->type == GGML_TYPE_F32 ||
1772
+ op->src[0]->type == GGML_TYPE_F16) {
1773
+ return true;
1774
+ }
1220
1775
  return false;
1221
1776
  }
1222
- if (mode & GGML_ROPE_TYPE_VISION) {
1777
+ if (is_vision) {
1778
+ if (op->src[0]->type == GGML_TYPE_F32 ||
1779
+ op->src[0]->type == GGML_TYPE_F16) {
1780
+ return true;
1781
+ }
1223
1782
  return false;
1224
1783
  }
1225
1784
  return true;
1226
1785
  }
1786
+ case GGML_OP_IM2COL:
1787
+ return true;
1227
1788
  default:
1228
1789
  return false;
1229
1790
  }
@@ -1431,8 +1992,15 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
1431
1992
 
1432
1993
  // The optimized gemm and gemv kernels are used for large matrices without batch.
1433
1994
  // tensor is the quantized weights matrix.
1434
- inline bool use_adreno_kernels(const ggml_tensor *tensor) {
1435
- return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 &&
1995
+ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
1996
+ int64_t threshold_ne0 = 512;
1997
+ int64_t threshold_ne1 = 512;
1998
+ if (!backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) &&
1999
+ backend_ctx->adreno_cl_compiler_version.type != DX) {
2000
+ threshold_ne0 = 128;
2001
+ threshold_ne1 = 128;
2002
+ }
2003
+ return tensor->ne[0] >= threshold_ne0 && tensor->ne[1] >= threshold_ne1 &&
1436
2004
  tensor->ne[2] == 1 && tensor->ne[3] == 1;
1437
2005
  }
1438
2006
 
@@ -1510,7 +2078,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1510
2078
  cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
1511
2079
 
1512
2080
  // The optimized kernels need weights in natural order, so unshuffle.
1513
- if (use_adreno_kernels(tensor)) {
2081
+ if (use_adreno_kernels(backend_ctx, tensor)) {
1514
2082
  kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
1515
2083
  }
1516
2084
  #else
@@ -1534,7 +2102,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
1534
2102
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1535
2103
  // Only do transpose for large, non batched matrix
1536
2104
  // TODO: use preallocated images instead of sub-buffer then image
1537
- if (use_adreno_kernels(tensor)) {
2105
+ if (use_adreno_kernels(backend_ctx, tensor)) {
1538
2106
  // <----------------------------------------------------------------------------------> //
1539
2107
  // start transpose
1540
2108
  // <----------------------------------------------------------------------------------> //
@@ -2582,6 +3150,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
2582
3150
  #endif
2583
3151
  }
2584
3152
 
3153
+ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3154
+ GGML_ASSERT(src0);
3155
+ GGML_ASSERT(src0->extra);
3156
+ GGML_ASSERT(dst);
3157
+ GGML_ASSERT(dst->extra);
3158
+
3159
+ UNUSED(src1);
3160
+
3161
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3162
+ cl_command_queue queue = backend_ctx->queue;
3163
+
3164
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3165
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3166
+
3167
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3168
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3169
+
3170
+ cl_kernel kernel;
3171
+
3172
+ int n = ggml_nelements(dst);
3173
+
3174
+ if (n % 4 == 0) {
3175
+ kernel = backend_ctx->kernel_gelu_quick_4;
3176
+ n /= 4;
3177
+ } else {
3178
+ kernel = backend_ctx->kernel_gelu_quick;
3179
+ }
3180
+
3181
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3182
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3183
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
3184
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
3185
+
3186
+ size_t global_work_size[] = {(size_t)n, 1, 1};
3187
+ size_t local_work_size[] = {64, 1, 1};
3188
+
3189
+ #ifdef GGML_OPENCL_PROFILING
3190
+ cl_event evt;
3191
+ clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
3192
+
3193
+ g_profiling_info.emplace_back();
3194
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
3195
+ #else
3196
+ clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
3197
+ #endif
3198
+ }
3199
+
2585
3200
  static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2586
3201
  GGML_ASSERT(src0);
2587
3202
  GGML_ASSERT(src0->extra);
@@ -2788,8 +3403,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
2788
3403
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2789
3404
  cl_command_queue queue = backend_ctx->queue;
2790
3405
 
2791
- ggml_backend_opencl_device_context * dev_ctx =
2792
- (ggml_backend_opencl_device_context *)backend->device->context;
3406
+ //ggml_backend_opencl_device_context * dev_ctx =
3407
+ // (ggml_backend_opencl_device_context *)backend->device->context;
2793
3408
 
2794
3409
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2795
3410
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -2820,13 +3435,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
2820
3435
 
2821
3436
  // Note, this kernel declares local memory in kernel args and the size
2822
3437
  // depends on subgroup size.
2823
- // Retrieve subgroup size.
2824
3438
  // Note, this requires OpenCL 2.1 and above
3439
+ // For now we use fixed subgroup size to simplify support for OpenCL 2.0.
2825
3440
  size_t sgs;
2826
- CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
2827
- CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
2828
- sizeof(local_work_size), local_work_size,
2829
- sizeof(size_t), &sgs, NULL));
3441
+ //CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
3442
+ // CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3443
+ // sizeof(local_work_size), local_work_size,
3444
+ // sizeof(size_t), &sgs, NULL));
3445
+ if (backend_ctx->gpu_family == ADRENO) {
3446
+ sgs = 64;
3447
+ } else if (backend_ctx->gpu_family == INTEL) {
3448
+ sgs = 32;
3449
+ } else {
3450
+ GGML_ASSERT(false && "Unsupported GPU");
3451
+ }
2830
3452
 
2831
3453
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2832
3454
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -2919,7 +3541,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
2919
3541
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2920
3542
  cl_context context = backend_ctx->context;
2921
3543
 
2922
- if (ne01 && ne1 && use_adreno_kernels(src0)) {
3544
+ if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
2923
3545
 
2924
3546
  // init CL objects
2925
3547
  // <--------------------------------------------> //
@@ -3980,6 +4602,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
3980
4602
  float attn_factor;
3981
4603
  float beta_fast;
3982
4604
  float beta_slow;
4605
+ int32_t sections[4];
3983
4606
 
3984
4607
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
3985
4608
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -3987,29 +4610,62 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
3987
4610
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
3988
4611
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
3989
4612
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
4613
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
3990
4614
 
3991
4615
  const bool is_neox = mode & 2;
4616
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
4617
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
4618
+
4619
+ if (is_mrope) {
4620
+ GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
4621
+ }
4622
+
4623
+ if (is_vision) {
4624
+ GGML_ASSERT(n_dims == ne00/2);
4625
+ }
3992
4626
 
3993
4627
  cl_kernel kernel;
3994
4628
 
3995
- if (!is_neox) {
4629
+ if (is_neox) {
3996
4630
  switch (src0->type) {
3997
4631
  case GGML_TYPE_F32:
3998
- kernel = backend_ctx->kernel_rope_norm_f32;
4632
+ kernel = backend_ctx->kernel_rope_neox_f32;
3999
4633
  break;
4000
4634
  case GGML_TYPE_F16:
4001
- kernel = backend_ctx->kernel_rope_norm_f16;
4635
+ kernel = backend_ctx->kernel_rope_neox_f16;
4636
+ break;
4637
+ default:
4638
+ GGML_ASSERT(false);
4639
+ };
4640
+ } else if (is_mrope && !is_vision) {
4641
+ switch (src0->type) {
4642
+ case GGML_TYPE_F32:
4643
+ kernel = backend_ctx->kernel_rope_multi_f32;
4644
+ break;
4645
+ case GGML_TYPE_F16:
4646
+ kernel = backend_ctx->kernel_rope_multi_f16;
4002
4647
  break;
4003
4648
  default:
4004
4649
  GGML_ASSERT(false);
4005
4650
  };
4651
+ } else if (is_vision) {
4652
+ switch (src0->type) {
4653
+ case GGML_TYPE_F32:
4654
+ kernel = backend_ctx->kernel_rope_vision_f32;
4655
+ break;
4656
+ case GGML_TYPE_F16:
4657
+ kernel = backend_ctx->kernel_rope_vision_f16;
4658
+ break;
4659
+ default:
4660
+ GGML_ASSERT(false);
4661
+ }
4006
4662
  } else {
4007
4663
  switch (src0->type) {
4008
4664
  case GGML_TYPE_F32:
4009
- kernel = backend_ctx->kernel_rope_neox_f32;
4665
+ kernel = backend_ctx->kernel_rope_norm_f32;
4010
4666
  break;
4011
4667
  case GGML_TYPE_F16:
4012
- kernel = backend_ctx->kernel_rope_neox_f16;
4668
+ kernel = backend_ctx->kernel_rope_norm_f16;
4013
4669
  break;
4014
4670
  default:
4015
4671
  GGML_ASSERT(false);
@@ -4049,6 +4705,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
4049
4705
  CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
4050
4706
  CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
4051
4707
  CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
4708
+ if (is_mrope || is_vision) {
4709
+ CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
4710
+ }
4052
4711
 
4053
4712
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
4054
4713
  size_t local_work_size[] = {(size_t)nth, 1, 1};
@@ -4064,6 +4723,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
4064
4723
  #endif
4065
4724
  }
4066
4725
 
4726
+ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4727
+ GGML_ASSERT(src0);
4728
+ GGML_ASSERT(src1);
4729
+ GGML_ASSERT(src1->extra);
4730
+ GGML_ASSERT(dst);
4731
+ GGML_ASSERT(dst->extra);
4732
+
4733
+ // src0 - filter, src1 - input
4734
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4735
+ GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
4736
+
4737
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4738
+ cl_command_queue queue = backend_ctx->queue;
4739
+
4740
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4741
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4742
+
4743
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
4744
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4745
+
4746
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
4747
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
4748
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
4749
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
4750
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
4751
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
4752
+
4753
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
4754
+
4755
+ const cl_long IC = src1->ne[is_2D ? 2 : 1];
4756
+ const cl_long IH = is_2D ? src1->ne[1] : 1;
4757
+ const cl_long IW = src1->ne[0];
4758
+
4759
+ const cl_long KH = is_2D ? src0->ne[1] : 1;
4760
+ const cl_long KW = src0->ne[0];
4761
+
4762
+ const cl_long OH = is_2D ? dst->ne[2] : 1;
4763
+ const cl_long OW = dst->ne[1];
4764
+
4765
+ // nb is byte offset, src is type float32
4766
+ const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
4767
+ const cl_long batch = src1->ne[is_2D ? 3 : 2];
4768
+ const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
4769
+
4770
+ const cl_long pelements = OW*KW*KH;
4771
+ const cl_long CHW = IC*KH*KW;
4772
+
4773
+ cl_kernel kernel;
4774
+
4775
+ if(dst->type == GGML_TYPE_F16) {
4776
+ kernel = backend_ctx->kernel_im2col_f16;
4777
+ } else {
4778
+ kernel = backend_ctx->kernel_im2col_f32;
4779
+ }
4780
+
4781
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
4782
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
4783
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4784
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4785
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &batch_offset));
4786
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &delta_offset));
4787
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &IW));
4788
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &IH));
4789
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &IC));
4790
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &OW));
4791
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long), &OH));
4792
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long), &KW));
4793
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long), &KH));
4794
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long), &pelements));
4795
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long), &CHW));
4796
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &s0));
4797
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &s1));
4798
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &p0));
4799
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &p1));
4800
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &d0));
4801
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &d1));
4802
+
4803
+ const int num_blocks = (pelements + 256 - 1) / 256;
4804
+ size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
4805
+ size_t local_work_size[] = {256, 1, 1};
4806
+
4807
+ #ifdef GGML_OPENCL_PROFILING
4808
+ cl_event evt;
4809
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4810
+
4811
+ g_profiling_info.emplace_back();
4812
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4813
+ #else
4814
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4815
+ #endif
4816
+ }
4817
+
4067
4818
  //------------------------------------------------------------------------------
4068
4819
  // Op offloading
4069
4820
  //------------------------------------------------------------------------------
@@ -4104,8 +4855,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
4104
4855
  if (!any_on_device) {
4105
4856
  return false;
4106
4857
  }
4107
- GGML_ASSERT(ggml_is_contiguous(src0));
4108
- GGML_ASSERT(ggml_is_contiguous(src1));
4109
4858
  func = ggml_cl_add;
4110
4859
  break;
4111
4860
  case GGML_OP_MUL:
@@ -4122,6 +4871,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
4122
4871
  }
4123
4872
  func = ggml_cl_gelu;
4124
4873
  break;
4874
+ case GGML_UNARY_OP_GELU_QUICK:
4875
+ if (!any_on_device) {
4876
+ return false;
4877
+ }
4878
+ func = ggml_cl_gelu_quick;
4879
+ break;
4125
4880
  case GGML_UNARY_OP_SILU:
4126
4881
  if (!any_on_device) {
4127
4882
  return false;
@@ -4194,6 +4949,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
4194
4949
  }
4195
4950
  func = ggml_cl_rope;
4196
4951
  break;
4952
+ case GGML_OP_IM2COL:
4953
+ if (!any_on_device) {
4954
+ return false;
4955
+ }
4956
+ func = ggml_cl_im2col;
4957
+ break;
4197
4958
  default:
4198
4959
  return false;
4199
4960
  }