whispercpp 1.3.5 → 1.3.6

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
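The hunks reproduced below appear to come from data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp (entry 336 in the list above).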
@@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
         return ADRENO_GPU_GEN::A7X;
     }
 
-    if (strstr(device_name, "830")) {
+    if (strstr(device_name, "830") ||
+        strstr(device_name, "840")) {
         return ADRENO_GPU_GEN::A8X;
     }
 
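The hunk above widens Adreno generation detection: device names containing "840" are now classified as A8X alongside "830". A minimal self-contained sketch of the substring check (the ADRENO_UNKNOWN fallback enumerator is an assumption; the real function checks other generations first):

    #include <cstring>

    // A7X and A8X appear in the hunk; the unknown value is assumed.
    enum class ADRENO_GPU_GEN { ADRENO_UNKNOWN, A7X, A8X };

    static ADRENO_GPU_GEN classify_adreno(const char * device_name) {
        // Only the branch touched by this hunk is reproduced here:
        // both "830" and "840" in the device name now map to A8X.
        if (strstr(device_name, "830") || strstr(device_name, "840")) {
            return ADRENO_GPU_GEN::A8X;
        }
        return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
    }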
@@ -312,7 +313,7 @@ struct ProfilingInfo {
     cl_ulong cmd_duration_ns;
     // The time for the kernel to complete - COMPLETE - END
     cl_ulong cmd_complete_duration_ns;
-    // Total time to finish the kernel - COMPELTE - QUEUED
+    // Total time to finish the kernel - COMPLETE - QUEUED
     cl_ulong cmd_total_duration_ns;
     // Global and local work sizes.
     size_t global_size[3];
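For context, the ProfilingInfo duration fields above are differences between standard OpenCL event timestamps, queried via clGetEventProfilingInfo. A hedged sketch (evt is an assumed cl_event from a profiling-enabled queue; CL_PROFILING_COMMAND_COMPLETE requires OpenCL 2.0+):

    cl_ulong queued = 0, start = 0, end = 0, complete = 0;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED,   sizeof(queued),   &queued,   NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,    sizeof(start),    &start,    NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,      sizeof(end),      &end,      NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(complete), &complete, NULL);

    cl_ulong cmd_duration_ns          = end      - start;   // kernel execution time
    cl_ulong cmd_complete_duration_ns = complete - end;     // COMPLETE - END
    cl_ulong cmd_total_duration_ns    = complete - queued;  // COMPLETE - QUEUED (the comment fixed above)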
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
     int adreno_wave_size;
 
     cl_bool non_uniform_workgroups;
+    size_t image_max_buffer_size;
 
     cl_context context;
     cl_command_queue queue;
@@ -407,10 +409,13 @@ struct ggml_backend_opencl_context {
     ggml_cl_buffer prealloc_scales_trans;
     ggml_cl_buffer prealloc_act_trans;
 
+    // prealloc buffers for src0 and src1
+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
     cl_program program_add;
     cl_program program_add_id;
     cl_program program_clamp;
-    cl_program program_cpy;
     cl_program program_cvt;
     cl_program program_diag_mask_inf;
     cl_program program_gelu;
@@ -447,7 +452,6 @@ struct ggml_backend_opencl_context {
     cl_program program_rms_norm;
     cl_program program_group_norm;
     cl_program program_rope;
-    cl_program program_scale;
     cl_program program_silu;
     cl_program program_sigmoid;
     cl_program program_softmax_f32;
@@ -456,11 +460,8 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
-    cl_program program_repeat;
     cl_program program_pad;
-    cl_program program_tanh;
     cl_program program_upscale;
-    cl_program program_concat;
     cl_program program_conv_2d_f16;
     cl_program program_conv_2d_f32;
     cl_program program_conv_2d_f16_f32;
@@ -479,24 +480,27 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
     cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
     cl_kernel kernel_add_id;
-    cl_kernel kernel_scale;
+    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
     cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
     cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
-    cl_kernel kernel_mean_f32;
+    cl_kernel kernel_mean_f32, kernel_mean_f32_4;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
     cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
+    cl_kernel kernel_tri;
     cl_kernel kernel_fill;
     cl_kernel kernel_clamp;
     cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
         kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
     cl_kernel kernel_norm, kernel_norm_mul_add;
     cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
+    cl_kernel kernel_l2_norm_f32;
     cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
+    cl_kernel kernel_diag_f32;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
     std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16;
@@ -511,7 +515,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
     cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
     cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
-    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
     cl_kernel kernel_mul_mat_f32_f32;
     cl_kernel kernel_mul_mat_f16_f16;
     cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -522,30 +526,43 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mm_f16_f32_kq;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
-    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_restore_block_q4_0_noshuffle;
+    cl_kernel kernel_convert_block_q4_1_noshuffle;
+    cl_kernel kernel_restore_block_q4_1_noshuffle;
+    cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    cl_kernel kernel_mul_mv_q4_1_f32;
+    cl_kernel kernel_mul_mv_q4_1_f32_flat;
+    cl_kernel kernel_mul_mv_q4_K_f32;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_mul_mv_q6_K_f32_flat;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
     cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
+    cl_kernel kernel_solve_tri_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
-    cl_kernel kernel_sum_rows_f32;
-    cl_kernel kernel_repeat;
+    cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
+    cl_kernel kernel_cumsum_blk, kernel_cumsum_add;
+    cl_kernel kernel_repeat_f32;
     cl_kernel kernel_pad;
-    cl_kernel kernel_tanh_f32_nd;
-    cl_kernel kernel_tanh_f16_nd;
-    cl_kernel kernel_expm1_f32_nd;
-    cl_kernel kernel_expm1_f16_nd;
-    cl_kernel kernel_softplus_f32_nd;
-    cl_kernel kernel_softplus_f16_nd;
+    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
+    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
+    cl_kernel kernel_neg_f32, kernel_neg_f32_4, kernel_neg_f32_nc;
+    cl_kernel kernel_neg_f16, kernel_neg_f16_4, kernel_neg_f16_nc;
+    cl_kernel kernel_exp_f32, kernel_exp_f32_4, kernel_exp_f32_nc;
+    cl_kernel kernel_exp_f16, kernel_exp_f16_4, kernel_exp_f16_nc;
+    cl_kernel kernel_expm1_f32, kernel_expm1_f32_4, kernel_expm1_f32_nc;
+    cl_kernel kernel_expm1_f16, kernel_expm1_f16_4, kernel_expm1_f16_nc;
+    cl_kernel kernel_softplus_f32, kernel_softplus_f32_4, kernel_softplus_f32_nc;
+    cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32_contiguous;
-    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_concat_f32;
     cl_kernel kernel_conv_2d_f16;
     cl_kernel kernel_conv_2d_f32;
     cl_kernel kernel_conv_2d_f16_f32;
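The unary-op renames in this hunk replace the old *_nd kernels with three variants per op and type: a plain contiguous kernel, a float4-vectorized one (_4), and what appears to be a non-contiguous fallback (_nc); the suffix meanings are inferred from the naming, not confirmed by this excerpt. A minimal OpenCL C sketch of a scalar/vec4 pair (tanh is an OpenCL built-in for both float and float4; the real kernels also take tensor offsets and strides):

    __kernel void sketch_tanh_f32(__global const float * src, __global float * dst) {
        const size_t i = get_global_id(0);
        dst[i] = tanh(src[i]);
    }

    __kernel void sketch_tanh_f32_4(__global const float4 * src, __global float4 * dst) {
        const size_t i = get_global_id(0);
        dst[i] = tanh(src[i]); // each work-item handles four packed elements
    }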
@@ -558,7 +575,10 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
     cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
 
     std::vector<ProfilingInfo> profiling_info;
 
@@ -671,7 +691,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
     cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_8_buf;
     cl_kernel kernel_transpose_16_buf;
+    cl_kernel kernel_transpose_32_buf;
     cl_kernel kernel_transpose_16_4x1;
 
     // Gemm and Gemv related programs, kernels, etc
@@ -687,6 +709,10 @@ struct ggml_backend_opencl_context {
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+    cl_kernel kernel_gemv_noshuffle_q4_1_f32;
+    cl_kernel kernel_gemm_noshuffle_q4_1_f32;
+    cl_kernel kernel_mul_mm_q8_0_f32_8x4;
+    cl_kernel CL_mul_mat_vec_q8_0_f32;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
     void free() {
@@ -792,6 +818,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // tri
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "tri.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("tri.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_tri = clCreateKernel(prog, "kernel_tri_f32", &err), err));
+        GGML_LOG_CONT(".");
+
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
     // fill
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
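Worth noting in the new tri block (and repeated by the diag and solve_tri blocks further down): the cl_program is now a local prog released immediately after kernel creation, instead of being stored on the context like the deleted program_cpy and program_scale members. This is safe in OpenCL because a kernel retains a reference to its program; the program object is only destroyed once its kernels are also released. A raw-API sketch of the same pattern (build_program_from_source is the backend's helper; ctx, device, and compile_opts are assumed here):

    const char * src_ptr = kernel_src.c_str();
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src_ptr, NULL, &err);
    CL_CHECK(clBuildProgram(prog, 1, &device, compile_opts, NULL, NULL));

    cl_kernel tri = clCreateKernel(prog, "kernel_tri_f32", &err);

    // Dropping our program reference here is fine: the kernel keeps the
    // program alive until the kernel itself is released.
    CL_CHECK(clReleaseProgram(prog));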
@@ -835,13 +879,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("cpy.cl");
 #endif
-        backend_ctx->program_cpy =
+        cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
-        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
-        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
-        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(prog, "kernel_cpy_f16_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -861,12 +906,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
861
906
  CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
862
907
  CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
863
908
  CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
909
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_noshuffle", &err), err));
910
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_noshuffle", &err), err));
911
+ CL_CHECK((backend_ctx->kernel_convert_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
912
+ CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err));
864
913
  CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
865
914
  CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
866
915
  CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
867
916
  CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
868
917
  CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
869
918
  CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
919
+ CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
920
+ CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
921
+ CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
870
922
  GGML_LOG_CONT(".");
871
923
  }
872
924
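The convert/restore kernels registered above unpack ggml's interleaved block formats into separate quant, scale and (for q4_1) min buffers, and pack them back. For orientation, the CPU-side block layouts they operate on, as defined in ggml-common.h, with QK4_1 = 32 and QK_K = 256:

    // Reference block layouts from ggml-common.h (reproduced for orientation).
    typedef uint16_t ggml_half;

    #define QK4_1 32
    typedef struct {
        ggml_half d;              // block scale
        ggml_half m;              // block min
        uint8_t   qs[QK4_1 / 2];  // 32 x 4-bit quants, two per byte
    } block_q4_1;                 // 20 bytes per 32 weights

    #define QK_K 256
    typedef struct {
        uint8_t   ql[QK_K / 2];      // lower 4 bits of the quants
        uint8_t   qh[QK_K / 4];      // upper 2 bits of the quants
        int8_t    scales[QK_K / 16]; // per-16-weight sub-block scales
        ggml_half d;                 // super-block scale
    } block_q6_K;                    // 210 bytes per 256 weights

The flat extras added further below, ggml_tensor_extra_cl_q4_1 (q/d/m) and ggml_tensor_extra_cl_q6_K (ql/qh/s/d), hold one buffer per field of these blocks.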
 
@@ -887,6 +939,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
887
939
  GGML_LOG_CONT(".");
888
940
  }
889
941
 
942
+ // diag
943
+ {
944
+ #ifdef GGML_OPENCL_EMBED_KERNELS
945
+ const std::string kernel_src {
946
+ #include "diag.cl.h"
947
+ };
948
+ #else
949
+ const std::string kernel_src = read_file("diag.cl");
950
+ #endif
951
+ cl_program prog =
952
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
953
+
954
+ CL_CHECK((backend_ctx->kernel_diag_f32 = clCreateKernel(prog, "kernel_diag_f32", &err), err));
955
+ CL_CHECK(clReleaseProgram(prog));
956
+ GGML_LOG_CONT(".");
957
+ }
958
+
890
959
  // gelu
891
960
  {
892
961
  #ifdef GGML_OPENCL_EMBED_KERNELS
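For orientation, GGML_OP_DIAG embeds a vector as the diagonal of a square matrix. A scalar sketch of what kernel_diag_f32 presumably computes per row (the .cl source is not part of this diff):

    // Scalar reference for GGML_OP_DIAG: dst is n x n, src holds n values;
    // off-diagonal entries are zeroed.
    static void diag_f32_ref(const float * src, float * dst, int n) {
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                dst[i*n + j] = (i == j) ? src[i] : 0.0f;
            }
        }
    }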
@@ -952,6 +1021,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
952
1021
  GGML_LOG_CONT(".");
953
1022
  }
954
1023
 
1024
+ // solve_tri_f32
1025
+ {
1026
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1027
+ const std::string kernel_src {
1028
+ #include "solve_tri.cl.h"
1029
+ };
1030
+ #else
1031
+ const std::string kernel_src = read_file("solve_tri.cl");
1032
+ #endif
1033
+ cl_program prog =
1034
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1035
+
1036
+ CL_CHECK((backend_ctx->kernel_solve_tri_f32 = clCreateKernel(prog, "kernel_solve_tri_f32", &err), err));
1037
+ GGML_LOG_CONT(".");
1038
+ CL_CHECK(clReleaseProgram(prog));
1039
+ }
1040
+
955
1041
  // im2col_f32
956
1042
  {
957
1043
  #ifdef GGML_OPENCL_EMBED_KERNELS
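GGML_OP_SOLVE_TRI solves a triangular linear system. As a scalar reference, forward substitution for the lower-triangular case is sketched below; whether the kernel also handles upper-triangular or batched variants is not visible in this diff:

    // Solve L * x = b for x, with L lower-triangular (n x n, row-major).
    // Assumes a non-zero diagonal; no pivoting.
    static void solve_tri_lower_ref(const float * L, const float * b, float * x, int n) {
        for (int i = 0; i < n; ++i) {
            float s = b[i];
            for (int j = 0; j < i; ++j) {
                s -= L[i*n + j] * x[j];
            }
            x[i] = s / L[i*n + i];
        }
    }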
@@ -1072,14 +1158,65 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1072
1158
  GGML_LOG_CONT(".");
1073
1159
  }
1074
1160
 
1075
- // mul_mv_q6_k
1161
+ // mul_mv_q4_1_f32
1162
+ {
1163
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1164
+ const std::string kernel_src {
1165
+ #include "mul_mv_q4_1_f32.cl.h"
1166
+ };
1167
+ #else
1168
+ const std::string kernel_src = read_file("mul_mv_q4_1_f32.cl");
1169
+ #endif
1170
+ cl_program prog =
1171
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1172
+
1173
+ CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32", &err), err));
1174
+ CL_CHECK(clReleaseProgram(prog));
1175
+ GGML_LOG_CONT(".");
1176
+ }
1177
+
1178
+ // mul_mv_q4_1_f32_flat
1179
+ {
1180
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1181
+ const std::string kernel_src {
1182
+ #include "mul_mv_q4_1_f32_flat.cl.h"
1183
+ };
1184
+ #else
1185
+ const std::string kernel_src = read_file("mul_mv_q4_1_f32_flat.cl");
1186
+ #endif
1187
+ cl_program prog =
1188
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1189
+
1190
+ CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32_flat", &err), err));
1191
+ CL_CHECK(clReleaseProgram(prog));
1192
+ GGML_LOG_CONT(".");
1193
+ }
1194
+
1195
+ // mul_mv_q4_k_f32
1196
+ {
1197
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1198
+ const std::string kernel_src {
1199
+ #include "mul_mv_q4_k_f32.cl.h"
1200
+ };
1201
+ #else
1202
+ const std::string kernel_src = read_file("mul_mv_q4_k_f32.cl");
1203
+ #endif
1204
+ cl_program prog =
1205
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1206
+
1207
+ CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32", &err), err));
1208
+ CL_CHECK(clReleaseProgram(prog));
1209
+ GGML_LOG_CONT(".");
1210
+ }
1211
+
1212
+ // mul_mv_q6_k_f32
1076
1213
  {
1077
1214
  #ifdef GGML_OPENCL_EMBED_KERNELS
1078
1215
  const std::string kernel_src {
1079
- #include "mul_mv_q6_k.cl.h"
1216
+ #include "mul_mv_q6_k_f32.cl.h"
1080
1217
  };
1081
1218
  #else
1082
- const std::string kernel_src = read_file("mul_mv_q6_k.cl");
1219
+ const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
1083
1220
  #endif
1084
1221
  backend_ctx->program_mul_mv_q6_K =
1085
1222
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
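The new q4_1 mat-vec kernels compute y = Wx for weight rows stored as block_q4_1, where a quant dequantizes as w = d*q + m (q4_0, by contrast, uses w = d*(q - 8)). A scalar reference for one row follows, assuming the block_q4_1 layout sketched earlier; GGML_FP16_TO_FP32 is ggml's half-to-float conversion macro:

    // Scalar reference: dot product of one q4_1-quantized row with a float vector.
    // k is the row length, k % QK4_1 == 0. The low nibbles hold the first half
    // of each block, the high nibbles the second half.
    static float vec_dot_q4_1_ref(const block_q4_1 * blocks, const float * x, int k) {
        float acc = 0.0f;
        for (int ib = 0; ib < k / QK4_1; ++ib) {
            const float d = GGML_FP16_TO_FP32(blocks[ib].d);
            const float m = GGML_FP16_TO_FP32(blocks[ib].m);
            for (int j = 0; j < QK4_1 / 2; ++j) {
                const int q0 = blocks[ib].qs[j] & 0x0F;
                const int q1 = blocks[ib].qs[j] >> 4;
                acc += (d * q0 + m) * x[ib*QK4_1 + j];
                acc += (d * q1 + m) * x[ib*QK4_1 + j + QK4_1/2];
            }
        }
        return acc;
    }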
@@ -1088,6 +1225,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1088
1225
  GGML_LOG_CONT(".");
1089
1226
  }
1090
1227
 
1228
+ // mul_mv_q6_k_f32_flat
1229
+ {
1230
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1231
+ const std::string kernel_src {
1232
+ #include "mul_mv_q6_k_f32_flat.cl.h"
1233
+ };
1234
+ #else
1235
+ const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
1236
+ #endif
1237
+ cl_program prog =
1238
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1239
+
1240
+ CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
1241
+ CL_CHECK(clReleaseProgram(prog));
1242
+ GGML_LOG_CONT(".");
1243
+ }
1244
+
1091
1245
  // mul_mv_q8_0_f32
1092
1246
  {
1093
1247
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1280,6 +1434,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1280
1434
  GGML_LOG_CONT(".");
1281
1435
  }
1282
1436
 
1437
+ // mul_mm_q4_0_f32_l4_lm
1438
+ {
1439
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1440
+ const std::string kernel_src {
1441
+ #include "mul_mm_q4_0_f32_l4_lm.cl.h"
1442
+ };
1443
+ #else
1444
+ const std::string kernel_src = read_file("mul_mm_q4_0_f32_l4_lm.cl");
1445
+ #endif
1446
+ cl_program prog =
1447
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1448
+
1449
+ CL_CHECK((backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_0_f32_l4_lm", &err), err));
1450
+ GGML_LOG_CONT(".");
1451
+ }
1452
+
1453
+ // mul_mm_q4_1_f32_l4_lm
1454
+ {
1455
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1456
+ const std::string kernel_src {
1457
+ #include "mul_mm_q4_1_f32_l4_lm.cl.h"
1458
+ };
1459
+ #else
1460
+ const std::string kernel_src = read_file("mul_mm_q4_1_f32_l4_lm.cl");
1461
+ #endif
1462
+ cl_program prog =
1463
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1464
+
1465
+ CL_CHECK((backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_1_f32_l4_lm", &err), err));
1466
+ GGML_LOG_CONT(".");
1467
+ }
1468
+
1283
1469
  // mul_mm_q8_0_f32_l4_lm
1284
1470
  {
1285
1471
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1296,6 +1482,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1296
1482
  GGML_LOG_CONT(".");
1297
1483
  }
1298
1484
 
1485
+ // mul_mm_q6_k_f32_l4_lm
1486
+ {
1487
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1488
+ const std::string kernel_src {
1489
+ #include "mul_mm_q6_k_f32_l4_lm.cl.h"
1490
+ };
1491
+ #else
1492
+ const std::string kernel_src = read_file("mul_mm_q6_k_f32_l4_lm.cl");
1493
+ #endif
1494
+ cl_program prog =
1495
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1496
+
1497
+ CL_CHECK((backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q6_k_f32_l4_lm", &err), err));
1498
+ CL_CHECK(clReleaseProgram(prog));
1499
+ GGML_LOG_CONT(".");
1500
+ }
1501
+
1299
1502
  // mul_mm_f16_f32_kq_kqv
1300
1503
  {
1301
1504
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1384,6 +1587,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1384
1587
  GGML_LOG_CONT(".");
1385
1588
  }
1386
1589
 
1590
+ // l2_norm
1591
+ {
1592
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1593
+ const std::string kernel_src {
1594
+ #include "l2_norm.cl.h"
1595
+ };
1596
+ #else
1597
+ const std::string kernel_src = read_file("l2_norm.cl");
1598
+ #endif
1599
+ cl_program prog =
1600
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1601
+
1602
+ CL_CHECK((backend_ctx->kernel_l2_norm_f32 = clCreateKernel(prog, "kernel_l2_norm_f32", &err), err));
1603
+ CL_CHECK(clReleaseProgram(prog));
1604
+ GGML_LOG_CONT(".");
1605
+ }
1606
+
1387
1607
  // rope
1388
1608
  {
1389
1609
  #ifdef GGML_OPENCL_EMBED_KERNELS
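GGML_OP_L2_NORM, whose support is enabled later in supports_op, scales each row to unit Euclidean length. One common formulation as a scalar sketch; the kernel's exact eps handling is an assumption:

    // Row-wise L2 normalization: y = x / max(||x||, sqrt(eps)).
    // Needs <math.h> for sqrtf/fmaxf.
    static void l2_norm_row_ref(const float * x, float * y, int n, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i]*x[i];
        }
        const float scale = 1.0f / sqrtf(fmaxf(sum, eps));
        for (int i = 0; i < n; ++i) {
            y[i] = x[i] * scale;
        }
    }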
@@ -1416,10 +1636,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1416
1636
  #else
1417
1637
  const std::string kernel_src = read_file("scale.cl");
1418
1638
  #endif
1419
- backend_ctx->program_scale =
1639
+ cl_program prog =
1420
1640
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1421
1641
 
1422
- CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
1642
+ CL_CHECK((backend_ctx->kernel_scale_f32 = clCreateKernel(prog, "kernel_scale_f32", &err), err));
1643
+ CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
1644
+ CL_CHECK(clReleaseProgram(prog));
1423
1645
  GGML_LOG_CONT(".");
1424
1646
  }
1425
1647
 
@@ -1664,6 +1886,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1664
1886
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1665
1887
 
1666
1888
  CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
1889
+ CL_CHECK((backend_ctx->kernel_mean_f32_4 = clCreateKernel(prog, "kernel_mean_f32_4", &err), err));
1667
1890
 
1668
1891
  CL_CHECK(clReleaseProgram(prog));
1669
1892
  GGML_LOG_CONT(".");
@@ -1701,7 +1924,26 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1701
1924
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1702
1925
 
1703
1926
  CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
1927
+ CL_CHECK((backend_ctx->kernel_sum_rows_f32_4 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32_4", &err), err));
1928
+ GGML_LOG_CONT(".");
1929
+ }
1930
+
1931
+ // cumsum
1932
+ {
1933
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1934
+ const std::string kernel_src {
1935
+ #include "cumsum.cl.h"
1936
+ };
1937
+ #else
1938
+ const std::string kernel_src = read_file("cumsum.cl");
1939
+ #endif
1940
+ cl_program prog;
1941
+ prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1942
+
1943
+ CL_CHECK((backend_ctx->kernel_cumsum_blk = clCreateKernel(prog, "kernel_cumsum_blk", &err), err));
1944
+ CL_CHECK((backend_ctx->kernel_cumsum_add = clCreateKernel(prog, "kernel_cumsum_add", &err), err));
1704
1945
  GGML_LOG_CONT(".");
1946
+ CL_CHECK(clReleaseProgram(prog));
1705
1947
  }
1706
1948
 
1707
1949
  // sigmoid
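The cumsum program registers two kernels, kernel_cumsum_blk and kernel_cumsum_add, which suggests the standard two-pass scan: each workgroup scans its block and records the block total, then a second pass adds the exclusive prefix of the block totals. A host-side sketch of that decomposition; the exact split of work between the two kernels is an assumption, since cumsum.cl itself is not shown:

    #include <vector>

    // Two-pass inclusive prefix sum over n floats with block size B,
    // mirroring a blk/add kernel split.
    static void cumsum_two_pass_ref(float * v, int n, int B) {
        const int nblk = (n + B - 1) / B;
        std::vector<float> block_sum(nblk);

        // Pass 1 ("blk"): scan each block independently, record its total.
        for (int b = 0; b < nblk; ++b) {
            float acc = 0.0f;
            for (int i = b*B; i < n && i < (b+1)*B; ++i) {
                acc += v[i];
                v[i] = acc;
            }
            block_sum[b] = acc;
        }

        // Pass 2 ("add"): add the exclusive prefix of block totals to each block.
        float carry = 0.0f;
        for (int b = 0; b < nblk; ++b) {
            for (int i = b*B; i < n && i < (b+1)*B; ++i) {
                v[i] += carry;
            }
            carry += block_sum[b];
        }
    }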
@@ -1747,16 +1989,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1747
1989
  #else
1748
1990
  const std::string kernel_src = read_file("repeat.cl");
1749
1991
  #endif
1750
- if (!kernel_src.empty()) {
1751
- backend_ctx->program_repeat =
1752
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1753
- CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
1754
- GGML_LOG_CONT(".");
1755
- } else {
1756
- GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
1757
- backend_ctx->program_repeat = nullptr;
1758
- backend_ctx->kernel_repeat = nullptr;
1759
- }
1992
+ cl_program prog =
1993
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1994
+ CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
1995
+ CL_CHECK(clReleaseProgram(prog));
1996
+ GGML_LOG_CONT(".");
1760
1997
  }
1761
1998
 
1762
1999
  // pad
@@ -1789,18 +2026,58 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1789
2026
  #else
1790
2027
  const std::string kernel_src = read_file("tanh.cl");
1791
2028
  #endif
1792
- if (!kernel_src.empty()) {
1793
- backend_ctx->program_tanh =
1794
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1795
- CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
1796
- CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
1797
- GGML_LOG_CONT(".");
1798
- } else {
1799
- GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
1800
- backend_ctx->program_tanh = nullptr;
1801
- backend_ctx->kernel_tanh_f32_nd = nullptr;
1802
- backend_ctx->kernel_tanh_f16_nd = nullptr;
1803
- }
2029
+ cl_program prog =
2030
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2031
+ CL_CHECK((backend_ctx->kernel_tanh_f32 = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
2032
+ CL_CHECK((backend_ctx->kernel_tanh_f32_4 = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
2033
+ CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
2034
+ CL_CHECK((backend_ctx->kernel_tanh_f16 = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
2035
+ CL_CHECK((backend_ctx->kernel_tanh_f16_4 = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
2036
+ CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
2037
+ CL_CHECK(clReleaseProgram(prog));
2038
+ GGML_LOG_CONT(".");
2039
+ }
2040
+
2041
+ // neg
2042
+ {
2043
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2044
+ const std::string kernel_src {
2045
+ #include "neg.cl.h"
2046
+ };
2047
+ #else
2048
+ const std::string kernel_src = read_file("neg.cl");
2049
+ #endif
2050
+ cl_program prog =
2051
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2052
+ CL_CHECK((backend_ctx->kernel_neg_f32 = clCreateKernel(prog, "kernel_neg_f32", &err), err));
2053
+ CL_CHECK((backend_ctx->kernel_neg_f32_4 = clCreateKernel(prog, "kernel_neg_f32_4", &err), err));
2054
+ CL_CHECK((backend_ctx->kernel_neg_f32_nc = clCreateKernel(prog, "kernel_neg_f32_nc", &err), err));
2055
+ CL_CHECK((backend_ctx->kernel_neg_f16 = clCreateKernel(prog, "kernel_neg_f16", &err), err));
2056
+ CL_CHECK((backend_ctx->kernel_neg_f16_4 = clCreateKernel(prog, "kernel_neg_f16_4", &err), err));
2057
+ CL_CHECK((backend_ctx->kernel_neg_f16_nc = clCreateKernel(prog, "kernel_neg_f16_nc", &err), err));
2058
+ CL_CHECK(clReleaseProgram(prog));
2059
+ GGML_LOG_CONT(".");
2060
+ }
2061
+
2062
+ // exp
2063
+ {
2064
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2065
+ const std::string kernel_src {
2066
+ #include "exp.cl.h"
2067
+ };
2068
+ #else
2069
+ const std::string kernel_src = read_file("exp.cl");
2070
+ #endif
2071
+ cl_program prog =
2072
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2073
+ CL_CHECK((backend_ctx->kernel_exp_f32 = clCreateKernel(prog, "kernel_exp_f32", &err), err));
2074
+ CL_CHECK((backend_ctx->kernel_exp_f32_4 = clCreateKernel(prog, "kernel_exp_f32_4", &err), err));
2075
+ CL_CHECK((backend_ctx->kernel_exp_f32_nc = clCreateKernel(prog, "kernel_exp_f32_nc", &err), err));
2076
+ CL_CHECK((backend_ctx->kernel_exp_f16 = clCreateKernel(prog, "kernel_exp_f16", &err), err));
2077
+ CL_CHECK((backend_ctx->kernel_exp_f16_4 = clCreateKernel(prog, "kernel_exp_f16_4", &err), err));
2078
+ CL_CHECK((backend_ctx->kernel_exp_f16_nc = clCreateKernel(prog, "kernel_exp_f16_nc", &err), err));
2079
+ CL_CHECK(clReleaseProgram(prog));
2080
+ GGML_LOG_CONT(".");
1804
2081
  }
1805
2082
 
1806
2083
  // expm1
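The reworked unary loads drop the old *_nd kernels in favour of three variants per type: a scalar kernel, a *_4 kernel that processes float4/half4 vectors when the element count is divisible by 4, and a *_nc fallback for non-contiguous tensors. An illustrative OpenCL C sketch of the scalar and vectorized f32 pair for neg; the shipped neg.cl is not part of this diff, so the signatures below are assumptions modeled on the file's conventions:

    // Illustrative OpenCL C, not the shipped neg.cl.
    kernel void kernel_neg_f32(global const float * src, ulong offset_src,
                               global float * dst, ulong offset_dst) {
        global const float * s = (global const float *)((global const char *)src + offset_src);
        global float       * d = (global float *)((global char *)dst + offset_dst);
        d[get_global_id(0)] = -s[get_global_id(0)];
    }

    kernel void kernel_neg_f32_4(global const float4 * src, ulong offset_src,
                                 global float4 * dst, ulong offset_dst) {
        global const float4 * s = (global const float4 *)((global const char *)src + offset_src);
        global float4       * d = (global float4 *)((global char *)dst + offset_dst);
        d[get_global_id(0)] = -s[get_global_id(0)];
    }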
@@ -1812,20 +2089,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1812
2089
  #else
1813
2090
  const std::string kernel_src = read_file("expm1.cl");
1814
2091
  #endif
1815
- cl_program prog;
1816
- if (!kernel_src.empty()) {
1817
- prog =
1818
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1819
- CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
1820
- CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
1821
- GGML_LOG_CONT(".");
1822
- } else {
1823
- GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
1824
- prog = nullptr;
1825
- backend_ctx->kernel_expm1_f32_nd = nullptr;
1826
- backend_ctx->kernel_expm1_f16_nd = nullptr;
1827
- }
2092
+ cl_program prog =
2093
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2094
+ CL_CHECK((backend_ctx->kernel_expm1_f32 = clCreateKernel(prog, "kernel_expm1_f32", &err), err));
2095
+ CL_CHECK((backend_ctx->kernel_expm1_f32_4 = clCreateKernel(prog, "kernel_expm1_f32_4", &err), err));
2096
+ CL_CHECK((backend_ctx->kernel_expm1_f32_nc = clCreateKernel(prog, "kernel_expm1_f32_nc", &err), err));
2097
+ CL_CHECK((backend_ctx->kernel_expm1_f16 = clCreateKernel(prog, "kernel_expm1_f16", &err), err));
2098
+ CL_CHECK((backend_ctx->kernel_expm1_f16_4 = clCreateKernel(prog, "kernel_expm1_f16_4", &err), err));
2099
+ CL_CHECK((backend_ctx->kernel_expm1_f16_nc = clCreateKernel(prog, "kernel_expm1_f16_nc", &err), err));
1828
2100
  CL_CHECK(clReleaseProgram(prog));
2101
+ GGML_LOG_CONT(".");
1829
2102
  }
1830
2103
 
1831
2104
  // softplus
@@ -1837,20 +2110,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1837
2110
  #else
1838
2111
  const std::string kernel_src = read_file("softplus.cl");
1839
2112
  #endif
1840
- cl_program prog;
1841
- if (!kernel_src.empty()) {
1842
- prog =
1843
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1844
- CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
1845
- CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
1846
- GGML_LOG_CONT(".");
1847
- } else {
1848
- GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
1849
- prog = nullptr;
1850
- backend_ctx->kernel_softplus_f32_nd = nullptr;
1851
- backend_ctx->kernel_softplus_f16_nd = nullptr;
1852
- }
2113
+ cl_program prog =
2114
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2115
+ CL_CHECK((backend_ctx->kernel_softplus_f32 = clCreateKernel(prog, "kernel_softplus_f32", &err), err));
2116
+ CL_CHECK((backend_ctx->kernel_softplus_f32_4 = clCreateKernel(prog, "kernel_softplus_f32_4", &err), err));
2117
+ CL_CHECK((backend_ctx->kernel_softplus_f32_nc = clCreateKernel(prog, "kernel_softplus_f32_nc", &err), err));
2118
+ CL_CHECK((backend_ctx->kernel_softplus_f16 = clCreateKernel(prog, "kernel_softplus_f16", &err), err));
2119
+ CL_CHECK((backend_ctx->kernel_softplus_f16_4 = clCreateKernel(prog, "kernel_softplus_f16_4", &err), err));
2120
+ CL_CHECK((backend_ctx->kernel_softplus_f16_nc = clCreateKernel(prog, "kernel_softplus_f16_nc", &err), err));
1853
2121
  CL_CHECK(clReleaseProgram(prog));
2122
+ GGML_LOG_CONT(".");
1854
2123
  }
1855
2124
 
1856
2125
  // upscale
@@ -1892,22 +2161,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1892
2161
  #include "concat.cl.h"
1893
2162
  };
1894
2163
  #else
1895
-
1896
2164
  const std::string kernel_src = read_file("concat.cl");
1897
2165
  #endif
1898
- if (!kernel_src.empty()) {
1899
- backend_ctx->program_concat =
1900
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1901
-
1902
- CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
1903
- CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
1904
- GGML_LOG_CONT(".");
1905
- } else {
1906
- GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
1907
- backend_ctx->program_concat = nullptr;
1908
- backend_ctx->kernel_concat_f32_contiguous = nullptr;
1909
- backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
1910
- }
2166
+ cl_program prog =
2167
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2168
+ CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
2169
+ CL_CHECK(clReleaseProgram(prog));
2170
+ GGML_LOG_CONT(".");
1911
2171
  }
1912
2172
 
1913
2173
  // timestep_embedding
@@ -2107,7 +2367,9 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
2107
2367
  CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
2108
2368
  CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
2109
2369
  CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
2370
+ CL_CHECK((backend_ctx->kernel_transpose_8_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_8_buf", &err), err));
2110
2371
  CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
2372
+ CL_CHECK((backend_ctx->kernel_transpose_32_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_buf", &err), err));
2111
2373
  CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
2112
2374
  GGML_LOG_CONT(".");
2113
2375
  }
@@ -2227,42 +2489,121 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
2227
2489
  GGML_LOG_CONT(".");
2228
2490
  }
2229
2491
 
2230
- std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
2231
- " -cl-mad-enable "
2232
- " -cl-fast-relaxed-math";
2233
-
2234
- // gemv_moe_mxfp4_f32
2492
+ // gemm_noshuffle_q4_1_f32
2235
2493
  {
2236
2494
  #ifdef GGML_OPENCL_EMBED_KERNELS
2237
2495
  const std::string kernel_src {
2238
- #include "gemv_moe_mxfp4_f32.cl.h"
2239
- };
2496
+ #include "gemm_noshuffle_q4_1_f32.cl.h"
2497
+ };
2240
2498
  #else
2241
- const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
2499
+ const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl");
2242
2500
  #endif
2243
- backend_ctx->program_gemv_moe_mxfp4_f32 =
2244
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2245
-
2246
- CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
2501
+ cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2502
+ CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_1_f32", &err), err));
2503
+ CL_CHECK(clReleaseProgram(prog));
2247
2504
  GGML_LOG_CONT(".");
2248
2505
  }
2249
2506
 
2250
- // gemm_moe_mxfp4_f32
2507
+ // gemv_noshuffle_q4_1_f32
2251
2508
  {
2509
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2510
+ " -cl-mad-enable ";
2511
+ if (backend_ctx->has_vector_subgroup_broadcast) {
2512
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2513
+ }
2514
+
2252
2515
  #ifdef GGML_OPENCL_EMBED_KERNELS
2253
2516
  const std::string kernel_src {
2254
- #include "gemm_moe_mxfp4_f32.cl.h"
2517
+ #include "gemv_noshuffle_q4_1_f32.cl.h"
2255
2518
  };
2256
2519
  #else
2257
- const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
2520
+ const std::string kernel_src = read_file("gemv_noshuffle_q4_1_f32.cl");
2258
2521
  #endif
2259
- backend_ctx->program_gemm_moe_mxfp4_f32 =
2260
- build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2261
2522
 
2262
- CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
2523
+ cl_program prog = build_program_from_source(
2524
+ backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
2525
+
2526
+ CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_1_f32", &err), err));
2527
+ CL_CHECK(clReleaseProgram(prog));
2263
2528
  GGML_LOG_CONT(".");
2264
2529
  }
2265
- #endif // GGML_OPENCL_USE_ADRENO_KERNELS
2530
+
2531
+ // mul_mm_q8_0_f32_8x4
2532
+ {
2533
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2534
+ const std::string kernel_src_q8_8x4_gemm {
2535
+ #include "mul_mm_q8_0_f32_8x4.cl.h"
2536
+ };
2537
+ #else
2538
+ const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
2539
+ #endif
2540
+ backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
2541
+ CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
2542
+ GGML_LOG_CONT(".");
2543
+ }
2544
+
2545
+ // gemv_noshuffle_general_q8_0_f32
2546
+ {
2547
+ std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2548
+ " -cl-mad-enable "
2549
+ " -DSIMDGROUP_WIDTH=" +
2550
+ std::to_string(backend_ctx->adreno_wave_size);
2551
+ if (backend_ctx->has_vector_subgroup_broadcast) {
2552
+ CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2553
+ }
2554
+
2555
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2556
+ const std::string kernel_src_CL_gemv_general {
2557
+ #include "gemv_noshuffle_general_q8_0_f32.cl.h"
2558
+ };
2559
+ #else
2560
+ const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
2561
+ #endif
2562
+
2563
+ cl_program prog = build_program_from_source(
2564
+ backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
2565
+
2566
+ CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q8_0_f32", &err), err));
2567
+ CL_CHECK(clReleaseProgram(prog));
2568
+ GGML_LOG_CONT(".");
2569
+ }
2570
+
2571
+ std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
2572
+ " -cl-mad-enable "
2573
+ " -cl-fast-relaxed-math";
2574
+
2575
+ // gemv_moe_mxfp4_f32
2576
+ {
2577
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2578
+ const std::string kernel_src {
2579
+ #include "gemv_moe_mxfp4_f32.cl.h"
2580
+ };
2581
+ #else
2582
+ const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
2583
+ #endif
2584
+ backend_ctx->program_gemv_moe_mxfp4_f32 =
2585
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2586
+
2587
+ CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
2588
+ GGML_LOG_CONT(".");
2589
+ }
2590
+
2591
+ // gemm_moe_mxfp4_f32
2592
+ {
2593
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2594
+ const std::string kernel_src {
2595
+ #include "gemm_moe_mxfp4_f32.cl.h"
2596
+ };
2597
+ #else
2598
+ const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
2599
+ #endif
2600
+ backend_ctx->program_gemm_moe_mxfp4_f32 =
2601
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2602
+
2603
+ CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
2604
+ GGML_LOG_CONT(".");
2605
+ }
2606
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
2266
2607
  GGML_LOG_CONT("\n");
2267
2608
  }
2268
2609
 
@@ -2315,7 +2656,7 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
2315
2656
 
2316
2657
  cl_platform_id platform_ids[NPLAT];
2317
2658
  if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
2318
- GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
2659
+ GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
2319
2660
  return found_devices;
2320
2661
  }
2321
2662
 
@@ -2621,6 +2962,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
2621
2962
  clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
2622
2963
  GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
2623
2964
 
2965
+ clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
2966
+ GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
2967
+
2624
2968
  clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
2625
2969
  GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
2626
2970
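CL_DEVICE_IMAGE_MAX_BUFFER_SIZE reports the maximum width, in pixels, of an image1d_buffer_t. The Adreno paths wrap quantized weights in CL_MEM_OBJECT_IMAGE1D_BUFFER images, so the limit is worth logging here. The query above follows the file's existing unchecked style; a checked variant would look like this sketch:

    size_t image_max_buffer_size = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
                                 sizeof(image_max_buffer_size), &image_max_buffer_size, NULL);
    if (err != CL_SUCCESS) {
        image_max_buffer_size = 0; // treat as "image1d_buffer limit unknown"
    }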
 
@@ -2729,6 +3073,82 @@ static void ggml_cl2_free(ggml_backend_t backend) {
2729
3073
  }
2730
3074
  }
2731
3075
 
3076
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3077
+ static void transpose_2d(
3078
+ ggml_backend_opencl_context * backend_ctx,
3079
+ cl_kernel kernel,
3080
+ cl_mem src, cl_mem dst, size_t size,
3081
+ cl_int stride, cl_int rows,
3082
+ bool blocking = true
3083
+ ) {
3084
+ static ggml_cl_buffer buf;
3085
+
3086
+ cl_event evt;
3087
+ cl_int err;
3088
+
3089
+ buf.allocate(backend_ctx->context, size);
3090
+
3091
+ cl_mem trans;
3092
+ cl_buffer_region region;
3093
+
3094
+ region.origin = 0;
3095
+ region.size = size;
3096
+ CL_CHECK((trans = clCreateSubBuffer(
3097
+ buf.buffer, CL_MEM_READ_WRITE,
3098
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
3099
+
3100
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &src));
3101
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &trans));
3102
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &stride));
3103
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &rows));
3104
+
3105
+ size_t local_size[3] = {64, 1, 1};
3106
+ size_t global_size[3] = {(size_t)stride, (size_t)rows, 1};
3107
+ CL_CHECK(clEnqueueNDRangeKernel(backend_ctx->queue, kernel, 3, NULL,
3108
+ global_size, local_size, 0, NULL, NULL));
3109
+
3110
+ if (blocking) {
3111
+ CL_CHECK(clEnqueueCopyBuffer(backend_ctx->queue, trans, dst, 0, 0, size, 0, NULL, &evt));
3112
+ CL_CHECK(clWaitForEvents(1, &evt));
3113
+ CL_CHECK(clReleaseEvent(evt));
3114
+ } else {
3115
+ CL_CHECK(clEnqueueCopyBuffer(backend_ctx->queue, trans, dst, 0, 0, size, 0, NULL, NULL));
3116
+ }
3117
+
3118
+ CL_CHECK(clReleaseMemObject(trans));
3119
+ }
3120
+
3121
+ static void transpose_2d_as_8b(
3122
+ ggml_backend_opencl_context * backend_ctx,
3123
+ cl_mem src, cl_mem dst, size_t size,
3124
+ cl_int stride, cl_int rows,
3125
+ bool blocking = true
3126
+ ) {
3127
+ transpose_2d(backend_ctx, backend_ctx->kernel_transpose_8_buf,
3128
+ src, dst, size, stride, rows, blocking);
3129
+ }
3130
+
3131
+ static void transpose_2d_as_16b(
3132
+ ggml_backend_opencl_context * backend_ctx,
3133
+ cl_mem src, cl_mem dst, size_t size,
3134
+ cl_int stride, cl_int rows,
3135
+ bool blocking = true
3136
+ ) {
3137
+ transpose_2d(backend_ctx, backend_ctx->kernel_transpose_16_buf,
3138
+ src, dst, size, stride, rows, blocking);
3139
+ }
3140
+
3141
+ static void transpose_2d_as_32b(
3142
+ ggml_backend_opencl_context * backend_ctx,
3143
+ cl_mem src, cl_mem dst, size_t size,
3144
+ cl_int stride, cl_int rows,
3145
+ bool blocking = true
3146
+ ) {
3147
+ transpose_2d(backend_ctx, backend_ctx->kernel_transpose_32_buf,
3148
+ src, dst, size, stride, rows, blocking);
3149
+ }
3150
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
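Note that transpose_2d stages its result in a scratch buffer and copies it back, so src and dst may be the same cl_mem; the q4_1 set_tensor path further below relies on that to transpose extra->q, extra->d and extra->m in place. For example, mirroring a call made later:

    // In-place transpose of the q4_1 scale buffer, viewed as 16-bit elements:
    // K/32 fp16 scales per row, M rows. src == dst is fine because the kernel
    // writes into an internal scratch buffer first.
    transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);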
3151
+
2732
3152
  //------------------------------------------------------------------------------
2733
3153
  // Tensor extra management
2734
3154
  //------------------------------------------------------------------------------
@@ -2796,6 +3216,59 @@ struct ggml_tensor_extra_cl_q4_0 {
2796
3216
  }
2797
3217
  };
2798
3218
 
3219
+ struct ggml_tensor_extra_cl_q4_1 {
3220
+ // Quantized values.
3221
+ cl_mem q = nullptr;
3222
+ // Quantized values in image1d_buffer_t.
3223
+ cl_mem q_img = nullptr;
3224
+ // Scales.
3225
+ cl_mem d = nullptr;
3226
+ // Scales in image1d_buffer_t.
3227
+ cl_mem d_img = nullptr;
3228
+ // Min
3229
+ cl_mem m = nullptr;
3230
+ // Min in image1d_buffer_t.
3231
+ cl_mem m_img = nullptr;
3232
+ // Size of quantized values.
3233
+ size_t size_q = 0;
3234
+ // Size of scales.
3235
+ size_t size_d = 0;
3236
+ // Size of min values.
3237
+ size_t size_m = 0;
3238
+
3239
+ ~ggml_tensor_extra_cl_q4_1() {
3240
+ reset();
3241
+ }
3242
+
3243
+ void reset() {
3244
+ // q, d and m are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
3246
+ // They must be released so that the original buffer can also be
3247
+ // released, avoiding a memory leak.
3247
+ if (q != nullptr) {
3248
+ CL_CHECK(clReleaseMemObject(q));
3249
+ q = nullptr;
3250
+ }
3251
+ if (d != nullptr) {
3252
+ CL_CHECK(clReleaseMemObject(d));
3253
+ d = nullptr;
3254
+ }
3255
+ if (m != nullptr) {
3256
+ CL_CHECK(clReleaseMemObject(m));
3257
+ m = nullptr;
3258
+ }
3259
+ // Currently, q_img, d_img and m_img are only initialized when SMALL_ALLOC is
3260
+ // enabled. They point to the images in ggml_backend_opencl_buffer_context,
3261
+ // so there is no need to release them here.
3262
+ // TODO: initialize them for the non-SMALL_ALLOC path, or remove them.
3263
+ q_img = nullptr;
3264
+ d_img = nullptr;
3265
+ m_img = nullptr;
3266
+ size_q = 0;
3267
+ size_d = 0;
3268
+ size_m = 0;
3269
+ }
3270
+ };
3271
+
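With QK4_1 = 32, each block carries one fp16 scale, one fp16 min and 16 bytes of quants, so an n-element q4_1 tensor splits as below. For a 4096 x 4096 tensor that is 524,288 blocks: 1 MiB of scales, 1 MiB of mins and 8 MiB of quants, 10 MiB total, matching 20 bytes per block; the size_d + size_m + size_q == ggml_nbytes(tensor) assert in set_tensor encodes the same identity.

    // Size bookkeeping for an n-element q4_1 tensor (n % 32 == 0):
    const size_t nblk   = n / 32;   // e.g. 4096*4096/32 = 524288
    const size_t size_d = nblk * 2; // fp16 scales:  1 MiB for 4096x4096
    const size_t size_m = nblk * 2; // fp16 mins:    1 MiB
    const size_t size_q = n / 2;    // 4-bit quants: 8 MiB
    // total: nblk * 20 bytes == ggml_nbytes(tensor) for GGML_TYPE_Q4_1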
2799
3272
  struct ggml_tensor_extra_cl_mxfp4 {
2800
3273
  // Quantized values.
2801
3274
  cl_mem q = nullptr;
@@ -2874,6 +3347,50 @@ struct ggml_tensor_extra_cl_q8_0 {
2874
3347
  }
2875
3348
  };
2876
3349
 
3350
+ struct ggml_tensor_extra_cl_q6_K {
3351
+ // Lower 4 bits of quantized weights.
3352
+ cl_mem ql = nullptr;
3353
+ // Upper 2 bits of quantized weights.
3354
+ cl_mem qh = nullptr;
3355
+ // Scales for each block.
3356
+ cl_mem s = nullptr;
3357
+ // Scales for each super block.
3358
+ cl_mem d = nullptr;
3359
+
3360
+ size_t size_ql = 0;
3361
+ size_t size_qh = 0;
3362
+ size_t size_s = 0;
3363
+ size_t size_d = 0;
3364
+
3365
+ ~ggml_tensor_extra_cl_q6_K() {
3366
+ reset();
3367
+ }
3368
+
3369
+ void reset() {
3370
+ if (ql != nullptr) {
3371
+ CL_CHECK(clReleaseMemObject(ql));
3372
+ ql = nullptr;
3373
+ }
3374
+ if (qh != nullptr) {
3375
+ CL_CHECK(clReleaseMemObject(qh));
3376
+ qh = nullptr;
3377
+ }
3378
+ if (s != nullptr) {
3379
+ CL_CHECK(clReleaseMemObject(s));
3380
+ s = nullptr;
3381
+ }
3382
+ if (d != nullptr) {
3383
+ CL_CHECK(clReleaseMemObject(d));
3384
+ d = nullptr;
3385
+ }
3386
+
3387
+ size_ql = 0;
3388
+ size_qh = 0;
3389
+ size_s = 0;
3390
+ size_d = 0;
3391
+ }
3392
+ };
3393
+
2877
3394
  //------------------------------------------------------------------------------
2878
3395
  // Backend API
2879
3396
  //------------------------------------------------------------------------------
@@ -2923,7 +3440,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
2923
3440
  CL_CHECK(clReleaseEvent(evt));
2924
3441
  }
2925
3442
 
2926
- // Syncronizes the 'backend_ctx's device with others so that commands
3443
+ // Synchronizes the 'backend_ctx's device with others so that commands
2927
3444
  // enqueued to it won't start until commands in the other devices have
2928
3445
  // completed.
2929
3446
  static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
@@ -3040,6 +3557,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
3040
3557
  continue;
3041
3558
  }
3042
3559
 
3560
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
3561
+ continue;
3562
+ }
3563
+
3043
3564
  if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
3044
3565
  ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
3045
3566
  i += 2;
@@ -3124,9 +3645,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3124
3645
  default:
3125
3646
  return false;
3126
3647
  }
3648
+ case GGML_TYPE_I32:
3649
+ switch (op->type) {
3650
+ case GGML_TYPE_I32:
3651
+ return true;
3652
+ default:
3653
+ return false;
3654
+ }
3127
3655
  default:
3128
3656
  return false;
3129
3657
  }
3658
+ case GGML_OP_SET: {
3659
+ return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
3660
+ op->type == op->src[0]->type &&
3661
+ op->type == op->src[1]->type;
3662
+ }
3130
3663
  case GGML_OP_SCALE:
3131
3664
  return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3132
3665
  case GGML_OP_ADD:
@@ -3160,14 +3693,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3160
3693
  case GGML_UNARY_OP_SIGMOID:
3161
3694
  return ggml_is_contiguous(op->src[0]);
3162
3695
  case GGML_UNARY_OP_TANH:
3163
- return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3164
- (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
3696
+ case GGML_UNARY_OP_NEG:
3697
+ case GGML_UNARY_OP_EXP:
3698
+ return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
3165
3699
  case GGML_UNARY_OP_EXPM1:
3166
- return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3167
- (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
3700
+ return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
3168
3701
  case GGML_UNARY_OP_SOFTPLUS:
3169
- return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3170
- (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
3702
+ return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
3171
3703
  default:
3172
3704
  return false;
3173
3705
  }
@@ -3183,6 +3715,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3183
3715
  default:
3184
3716
  return false;
3185
3717
  }
3718
+ case GGML_OP_TRI:
3719
+ return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
3186
3720
  case GGML_OP_FILL:
3187
3721
  return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
3188
3722
  case GGML_OP_CLAMP:
@@ -3192,6 +3726,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3192
3726
  return true;
3193
3727
  case GGML_OP_RMS_NORM:
3194
3728
  return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
3729
+ case GGML_OP_L2_NORM:
3730
+ return ggml_is_contiguous_rows(op->src[0]);
3195
3731
  case GGML_OP_REPEAT:
3196
3732
  return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
3197
3733
  case GGML_OP_PAD:
@@ -3223,7 +3759,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3223
3759
  return true;
3224
3760
  } else if (op->src[0]->type == GGML_TYPE_F32) {
3225
3761
  return op->src[1]->type == GGML_TYPE_F32;
3226
- } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
3762
+ } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 ||
3763
+ op->src[0]->type == GGML_TYPE_MXFP4 ||
3764
+ op->src[0]->type == GGML_TYPE_Q4_K ||
3227
3765
  op->src[0]->type == GGML_TYPE_Q6_K) {
3228
3766
  return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
3229
3767
  } else if (op->src[0]->type == GGML_TYPE_Q8_0) {
@@ -3244,6 +3782,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3244
3782
  case GGML_OP_PERMUTE:
3245
3783
  case GGML_OP_TRANSPOSE:
3246
3784
  return true;
3785
+ case GGML_OP_DIAG:
3786
+ return true;
3247
3787
  case GGML_OP_DIAG_MASK_INF:
3248
3788
  return op->ne[3] == 1;
3249
3789
  case GGML_OP_ROPE: {
@@ -3266,6 +3806,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3266
3806
  }
3267
3807
  return true;
3268
3808
  }
3809
+ case GGML_OP_SOLVE_TRI:
3810
+ return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3269
3811
  case GGML_OP_IM2COL:
3270
3812
  return true;
3271
3813
  case GGML_OP_ARGSORT: {
@@ -3280,8 +3822,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
3280
3822
  return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
3281
3823
  }
3282
3824
  case GGML_OP_SUM_ROWS:
3283
- case GGML_OP_MEAN:
3825
+ case GGML_OP_CUMSUM:
3284
3826
  return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3827
+ case GGML_OP_MEAN:
3828
+ return op->src[0]->type == GGML_TYPE_F32;
3285
3829
  case GGML_OP_FLASH_ATTN_EXT:
3286
3830
  {
3287
3831
  const ggml_tensor * q = op->src[0];
@@ -3412,6 +3956,12 @@ struct ggml_backend_opencl_buffer_context {
3412
3956
  for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3413
3957
  delete e;
3414
3958
  }
3959
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
3960
+ delete e;
3961
+ }
3962
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
3963
+ delete e;
3964
+ }
3415
3965
  }
3416
3966
 
3417
3967
  ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -3444,6 +3994,21 @@ struct ggml_backend_opencl_buffer_context {
3444
3994
  return extra;
3445
3995
  }
3446
3996
 
3997
+ ggml_tensor_extra_cl_q4_1 * ggml_opencl_alloc_temp_tensor_extra_q4_1() {
3998
+ ggml_tensor_extra_cl_q4_1 * extra;
3999
+ if (temp_tensor_extras_q4_1.empty()) {
4000
+ extra = new ggml_tensor_extra_cl_q4_1();
4001
+ } else {
4002
+ extra = temp_tensor_extras_q4_1.back();
4003
+ temp_tensor_extras_q4_1.pop_back();
4004
+ }
4005
+
4006
+ temp_tensor_extras_q4_1_in_use.push_back(extra);
4007
+
4008
+ extra->reset();
4009
+ return extra;
4010
+ }
4011
+
3447
4012
  ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
3448
4013
  ggml_tensor_extra_cl_mxfp4 * extra;
3449
4014
  if (temp_tensor_extras_mxfp4.empty()) {
@@ -3474,6 +4039,21 @@ struct ggml_backend_opencl_buffer_context {
3474
4039
  return extra;
3475
4040
  }
3476
4041
 
4042
+ ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
4043
+ ggml_tensor_extra_cl_q6_K * extra;
4044
+ if (temp_tensor_extras_q6_K.empty()) {
4045
+ extra = new ggml_tensor_extra_cl_q6_K();
4046
+ } else {
4047
+ extra = temp_tensor_extras_q6_K.back();
4048
+ temp_tensor_extras_q6_K.pop_back();
4049
+ }
4050
+
4051
+ temp_tensor_extras_q6_K_in_use.push_back(extra);
4052
+
4053
+ extra->reset();
4054
+ return extra;
4055
+ }
4056
+
3477
4057
  void reset() {
3478
4058
  for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
3479
4059
  temp_tensor_extras.push_back(e);
@@ -3485,6 +4065,11 @@ struct ggml_backend_opencl_buffer_context {
3485
4065
  }
3486
4066
  temp_tensor_extras_q4_0_in_use.clear();
3487
4067
 
4068
+ for (ggml_tensor_extra_cl_q4_1 * e : temp_tensor_extras_q4_1_in_use) {
4069
+ temp_tensor_extras_q4_1.push_back(e);
4070
+ }
4071
+ temp_tensor_extras_q4_1_in_use.clear();
4072
+
3488
4073
  for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
3489
4074
  temp_tensor_extras_mxfp4.push_back(e);
3490
4075
  }
@@ -3494,6 +4079,11 @@ struct ggml_backend_opencl_buffer_context {
3494
4079
  temp_tensor_extras_q8_0.push_back(e);
3495
4080
  }
3496
4081
  temp_tensor_extras_q8_0_in_use.clear();
4082
+
4083
+ for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
4084
+ temp_tensor_extras_q6_K.push_back(e);
4085
+ }
4086
+ temp_tensor_extras_q6_K_in_use.clear();
3497
4087
  }
3498
4088
 
3499
4089
  // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -3505,14 +4095,18 @@ struct ggml_backend_opencl_buffer_context {
3505
4095
  std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
3506
4096
  std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
3507
4097
  std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
4098
+ std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1;
4099
+ std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1_in_use;
3508
4100
  std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
3509
4101
  std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
3510
4102
  std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
3511
4103
  std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
4104
+ std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
4105
+ std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
3512
4106
 
3513
4107
  // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
3514
4108
  // before any tensor is initialized (at the beginning of alloc_tensor_range).
3515
- // Hence, there is alway a buffer object in this vector. When each tensor is
4109
+ // Hence, there is always a buffer object in this vector. When each tensor is
3516
4110
  // being initialized, this original buffer object will be released if both
3517
4111
  // flattening and small allocation are enabled, and additional buffer
3518
4112
  // objects will be created in init_tensor to represent flattened quantized
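All four extra types follow the same pool discipline: the alloc helper pops from a free list (or news an object), pushes onto an in-use list and resets the extra; reset() on the buffer context moves everything back. A generic sketch of the pattern; the extra_pool template is hypothetical and not in the source:

    #include <vector>

    // Hypothetical generic form of the per-type extra pools above.
    template <typename Extra>
    struct extra_pool {
        std::vector<Extra *> free_list;
        std::vector<Extra *> in_use;

        Extra * alloc() {
            Extra * e;
            if (free_list.empty()) {
                e = new Extra();
            } else {
                e = free_list.back();
                free_list.pop_back();
            }
            in_use.push_back(e);
            e->reset(); // release any cl_mem subbuffers from a previous use
            return e;
        }

        void reset() {
            for (Extra * e : in_use) {
                free_list.push_back(e);
            }
            in_use.clear();
        }

        ~extra_pool() {
            for (Extra * e : free_list) delete e;
            for (Extra * e : in_use)   delete e;
        }
    };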
@@ -3550,7 +4144,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
3550
4144
  // Reuse extra of the parent tensor. The offset of this view tensor
3551
4145
  // becomes `extra->offset + view_offs` and needs to be calculated when
3552
4146
  // it is used. This changes is needed because of the change to
3553
- // ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
4147
+ // ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
3554
4148
  // `buffer` passed in here will always be `tensor->buffer`. It is OK
3555
4149
  // to allocate extras from the same buffer context for ordinary
3556
4150
  // intermediate tensors. But for views into kv cache tensors, doing so
@@ -3599,6 +4193,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
3599
4193
  return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
3600
4194
  }
3601
4195
 
4196
+ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
4197
+
4198
+ bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);
4199
+
4200
+ size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
4201
+
4202
+ return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
4203
+ }
4204
+
3602
4205
  static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3603
4206
  ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
3604
4207
 
@@ -3638,7 +4241,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
3638
4241
  //GGML_ASSERT(offset == 0);
3639
4242
 
3640
4243
  // We create subbuffers from the original tensor buffer for scales and
3641
- // quants - i.e., scales and quants are aliases into the buffer obejct
4244
+ // quants - i.e., scales and quants are aliases into the buffer object
3642
4245
  // that backs the original tensor. This is a cleaner way to adapt to the
3643
4246
  // new memory management.
3644
4247
  // In the old code, we allocate new buffers for scales and quants
@@ -3863,17 +4466,18 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
3863
4466
  return;
3864
4467
 
3865
4468
  }
3866
- if (tensor->type == GGML_TYPE_MXFP4) {
4469
+ if (tensor->type == GGML_TYPE_Q4_1) {
3867
4470
  ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
3868
4471
  GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
3869
4472
 
3870
4473
  // Allocate the new extra and create aliases from the original.
3871
4474
  ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3872
- ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
4475
+ ggml_tensor_extra_cl_q4_1 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_1();
3873
4476
 
3874
- size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
4477
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
4478
+ size_t size_m = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
3875
4479
  size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
3876
- GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
4480
+ GGML_ASSERT(size_d + size_m + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
3877
4481
 
3878
4482
  cl_int err;
3879
4483
  cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
@@ -3883,83 +4487,175 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
3883
4487
  queue, data_device, CL_TRUE, 0,
3884
4488
  ggml_nbytes(tensor), data, 0, NULL, NULL));
3885
4489
 
3886
- // The original tensor memory is divided into scales and quants, i.e.,
3887
- // we first store scales, then quants.
3888
4490
  cl_buffer_region region;
3889
4491
 
4492
+ // The original tensor memory is divided into scales and quants, i.e.,
4493
+ // we first store scales, mins, then quants.
3890
4494
  // Create subbuffer for scales.
3891
4495
  region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3892
- region.size = size_e;
3893
- extra->e = clCreateSubBuffer(
4496
+ region.size = size_d;
4497
+ extra->d = clCreateSubBuffer(
3894
4498
  extra_orig->data_device, CL_MEM_READ_WRITE,
3895
4499
  CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3896
4500
  CL_CHECK(err);
3897
4501
  auto previous_origin = region.origin;
3898
4502
 
4503
+ // Create subbuffer for mins.
4504
+ region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
4505
+ region.size = size_m;
4506
+ extra->m = clCreateSubBuffer(
4507
+ extra_orig->data_device, CL_MEM_READ_WRITE,
4508
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4509
+ CL_CHECK(err);
4510
+ previous_origin = region.origin;
4511
+
3899
4512
  // Create subbuffer for quants.
3900
- region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
4513
+ region.origin = align_to(previous_origin + size_m, backend_ctx->alignment);
3901
4514
  region.size = size_q;
3902
4515
  extra->q = clCreateSubBuffer(
3903
4516
  extra_orig->data_device, CL_MEM_READ_WRITE,
3904
4517
  CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3905
4518
  CL_CHECK(err);
3906
4519
 
3907
- #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3908
- if (use_adreno_moe_kernels(backend_ctx, tensor)) {
3909
- cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
3910
-
3911
- int ne00 = tensor->ne[0];
3912
- int ne01 = tensor->ne[1];
3913
- int ne02 = tensor->ne[2];
3914
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3915
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3916
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
3917
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
3918
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
3919
-
3920
- size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
3921
- size_t local_work_size[3] = {64, 2, 1};
3922
-
3923
- cl_event evt;
3924
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3925
- CL_CHECK(clWaitForEvents(1, &evt));
3926
- CL_CHECK(clReleaseMemObject(data_device));
3927
- tensor->extra = extra;
4520
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4521
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_1;
3928
4522
 
3929
- return;
4523
+ if (use_adreno_kernels(backend_ctx, tensor)) {
4524
+ kernel = backend_ctx->kernel_convert_block_q4_1_noshuffle;
3930
4525
  }
3931
- #endif
3932
- cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
3933
-
4526
+ #else
4527
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q4_1;
4528
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
3934
4529
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3935
4530
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3936
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
4531
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
4532
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m));
3937
4533
 
3938
- size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3939
- size_t local_work_size[3] = {64, 1, 1};
4534
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4535
+ size_t local_work_size[] = {64, 1, 1};
3940
4536
 
3941
4537
  cl_event evt;
3942
4538
  CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3943
4539
  CL_CHECK(clWaitForEvents(1, &evt));
3944
4540
  CL_CHECK(clReleaseMemObject(data_device));
3945
4541
 
3946
- // Create image for Q
3947
- cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
3948
- cl_image_desc img_desc_q = {
3949
- CL_MEM_OBJECT_IMAGE1D_BUFFER,
3950
- static_cast<size_t>(ggml_nelements(tensor)/32*2),
3951
- 0, 0, 0, 0, 0, 0, 0,
3952
- { extra->q }
3953
- };
3954
- extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
3955
4542
  tensor->extra = extra;
3956
4543
 
3957
- return;
3958
- }
3959
- if (tensor->type == GGML_TYPE_Q8_0) {
3960
- ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
3961
- GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
-
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_kernels(backend_ctx, tensor)) {
+
+ int M = tensor->ne[1];
+ int K = tensor->ne[0];
+
+ GGML_ASSERT(K % 32 == 0);
+
+ // Transpose q as ushort
+ transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
+ // Transpose d as ushort
+ transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
+ // Transpose m as ushort
+ transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
+ }
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+ return;
+ }
+ if (tensor->type == GGML_TYPE_MXFP4) {
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+ GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
4565
+
4566
+ // Allocate the new extra and create aliases from the original.
4567
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4568
+ ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
4569
+
4570
+ size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
4571
+ size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
4572
+ GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
4573
+
4574
+ cl_int err;
4575
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4576
+ ggml_nbytes(tensor), NULL, &err);
4577
+ CL_CHECK(err);
4578
+ CL_CHECK(clEnqueueWriteBuffer(
4579
+ queue, data_device, CL_TRUE, 0,
4580
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
4581
+
4582
+ // The original tensor memory is divided into scales and quants, i.e.,
4583
+ // we first store scales, then quants.
4584
+ cl_buffer_region region;
4585
+
4586
+ // Create subbuffer for scales.
4587
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
4588
+ region.size = size_e;
4589
+ extra->e = clCreateSubBuffer(
4590
+ extra_orig->data_device, CL_MEM_READ_WRITE,
4591
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4592
+ CL_CHECK(err);
4593
+ auto previous_origin = region.origin;
4594
+
4595
+ // Create subbuffer for quants.
4596
+ region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
4597
+ region.size = size_q;
4598
+ extra->q = clCreateSubBuffer(
4599
+ extra_orig->data_device, CL_MEM_READ_WRITE,
4600
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4601
+ CL_CHECK(err);
4602
+
4603
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4604
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
4605
+ cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
4606
+
4607
+ int ne00 = tensor->ne[0];
4608
+ int ne01 = tensor->ne[1];
4609
+ int ne02 = tensor->ne[2];
4610
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4611
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
4612
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
4613
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
4614
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
4615
+
4616
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
4617
+ size_t local_work_size[3] = {64, 2, 1};
4618
+
4619
+ cl_event evt;
4620
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4621
+ CL_CHECK(clWaitForEvents(1, &evt));
4622
+ CL_CHECK(clReleaseMemObject(data_device));
4623
+ tensor->extra = extra;
4624
+
4625
+ return;
4626
+ }
4627
+ #endif
4628
+ cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
4629
+
4630
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4631
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
4632
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
4633
+
4634
+ size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4635
+ size_t local_work_size[3] = {64, 1, 1};
4636
+
4637
+ cl_event evt;
4638
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4639
+ CL_CHECK(clWaitForEvents(1, &evt));
4640
+ CL_CHECK(clReleaseMemObject(data_device));
4641
+
4642
+ // Create image for Q
4643
+ cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
4644
+ cl_image_desc img_desc_q = {
4645
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
4646
+ static_cast<size_t>(ggml_nelements(tensor)/32*2),
4647
+ 0, 0, 0, 0, 0, 0, 0,
4648
+ { extra->q }
4649
+ };
4650
+ extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
4651
+ tensor->extra = extra;
4652
+
4653
+ return;
4654
+ }
4655
+ if (tensor->type == GGML_TYPE_Q8_0) {
4656
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
4657
+ GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
  // Allocate the new extra and create aliases from the original.
  ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
  ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
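
The MXFP4 conversion above splits the flat tensor bytes into a scales sub-buffer followed by a quants sub-buffer, each starting on an aligned origin. A minimal sketch of that origin arithmetic, assuming align_to rounds up to the next multiple of the device alignment (the helper's definition is outside this diff, so treat it as an assumption):

    #include <cassert>
    #include <cstddef>

    // Assumed behaviour of align_to: round value up to a multiple of alignment.
    static size_t align_to(size_t value, size_t alignment) {
        return ((value + alignment - 1) / alignment) * alignment;
    }

    int main() {
        const size_t alignment = 128;  // stand-in for backend_ctx->alignment
        const size_t base      = 1000; // extra_orig->offset + tensor->view_offs + offset
        const size_t size_e    = 300;  // scales region: one byte per block

        const size_t origin_e = align_to(base, alignment);              // scales sub-buffer
        const size_t origin_q = align_to(origin_e + size_e, alignment); // quants follow scales
        assert(origin_q % alignment == 0 && origin_q >= origin_e + size_e);
        return 0;
    }
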
@@ -4013,6 +4709,216 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,

  tensor->extra = extra;

+ // Transpose the weights and scales
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+
+ int M = tensor->ne[1]; // ne01
+ int K = tensor->ne[0]; // ne00
+
+ GGML_ASSERT(K % 32 == 0);
+ GGML_ASSERT(M % 4 == 0);
+ GGML_ASSERT(tensor->ne[2] == 1);
+ GGML_ASSERT(tensor->ne[3] == 1);
+
+ // Transpose weights
+ size_t q_size_bytes = K * M / 4 * sizeof(float);
+ cl_buffer_region region;
+ region.origin = 0;
+ region.size = q_size_bytes;
+ cl_mem qT_d = clCreateSubBuffer(
+ backend_ctx->prealloc_quant_trans.buffer,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &region,
+ &err);
+ CL_CHECK(err);
+
+ cl_mem q_d_image1D;
+ cl_mem qT_d_image1D;
+
+ cl_image_format img_fmt_1d;
+ cl_image_desc img_desc_1d;
+
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 4 / 4;
+ img_desc_1d.buffer = extra->q;
+ q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ img_fmt_1d = { CL_RGBA, CL_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 4 / 4;
+ img_desc_1d.buffer = qT_d;
+ qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ int height_q = M / 4;
+ int width_q = K / 4 / 4;
+ kernel = backend_ctx->kernel_transpose_32;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
+
+ size_t local_size_q[3] = {4, 16, 1};
+ size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ // Transpose scales
+ size_t d_size_bytes = M * (K / 32) * 2;
+ region.origin = 0;
+ region.size = d_size_bytes;
+ cl_mem dT_d = clCreateSubBuffer(
+ backend_ctx->prealloc_scales_trans.buffer,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &region,
+ &err);
+ CL_CHECK(err);
+
+ cl_mem d_d_image1D;
+ cl_mem dT_d_image1D;
+
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+ img_desc_1d.image_width = M * K / 32;
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.buffer = extra->d;
+ d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 32 / 4;
+ img_desc_1d.buffer = dT_d;
+ dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+ CL_CHECK(err);
+
+ int height_s = M / 4;
+ int width_s = K / 32;
+
+ kernel = backend_ctx->kernel_transpose_16_4x1;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+ size_t local_size_s[3] = {4, 16, 1};
+ size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ // copy transposed buffer contents to original buffers
+ CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ CL_CHECK(clReleaseMemObject(qT_d));
+ CL_CHECK(clReleaseMemObject(dT_d));
+
+ CL_CHECK(clReleaseMemObject(q_d_image1D));
+ CL_CHECK(clReleaseMemObject(d_d_image1D));
+ CL_CHECK(clReleaseMemObject(qT_d_image1D));
+ CL_CHECK(clReleaseMemObject(dT_d_image1D));
+ } // end transpose
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+ return;
+ }
+ if (tensor->type == GGML_TYPE_Q6_K) {
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+ GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+ // Allocate the new extra and create aliases from the original.
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+ ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();
+
+ size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+ size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
+ size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+ GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
+ "Incorrect tensor size");
+
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+ CL_CHECK(clEnqueueWriteBuffer(
+ queue, data_device, CL_TRUE, 0,
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+ cl_buffer_region region;
+
+ // Subbuffer for ql
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+ region.size = size_ql;
+ extra->ql = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ auto previous_origin = region.origin;
+
+ // Subbuffer for qh
+ region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
+ region.size = size_qh;
+ extra->qh = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Subbuffer for scales
+ region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
+ region.size = size_s;
+ extra->s = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Create subbuffer for d.
+ region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
+ region.size = size_d;
+ extra->d = clCreateSubBuffer(
+ extra_orig->data_device, CL_MEM_READ_WRITE,
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ CL_CHECK(err);
+ previous_origin = region.origin;
+
+ // Flatten the weights
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clReleaseMemObject(data_device));
+
+ extra->size_ql = size_ql;
+ extra->size_qh = size_qh;
+ extra->size_s = size_s;
+ extra->size_d = size_d;
+
+ tensor->extra = extra;
  return;
  }
  #endif // GGML_OPENCL_SOA_Q
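
The Q6_K path above carves the tensor into four flat arrays. A quick standalone check, assuming the standard ggml super-block size QK_K = 256 (so sizeof(block_q6_K) is 210 bytes), that the four sizes computed above tile the tensor exactly:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t QK_K    = 256;         // assumed ggml_blck_size(GGML_TYPE_Q6_K)
        const int64_t nelem   = 256 * 4096;  // any multiple of QK_K
        const int64_t nblocks = nelem / QK_K;

        const int64_t size_ql = nelem / 2;   // 4 low bits per element
        const int64_t size_qh = nelem / 4;   // 2 high bits per element
        const int64_t size_s  = nelem / 16;  // one int8 scale per 16 elements
        const int64_t size_d  = nblocks * 2; // one fp16 super-scale per block

        assert(size_ql + size_qh + size_s + size_d == nblocks * 210);
        return 0;
    }
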
@@ -4155,28 +5061,103 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  size, data, 0, NULL, NULL));
  CL_CHECK(clReleaseMemObject(data_device));
  return;
- } else if (tensor->type == GGML_TYPE_MXFP4) {
- ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
+ }
+ if (tensor->type == GGML_TYPE_Q4_1) {
+ ggml_tensor_extra_cl_q4_1 * extra = (ggml_tensor_extra_cl_q4_1 *)tensor->extra;
+
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_kernels(backend_ctx, tensor)) {
+ static ggml_cl_buffer buf_trans_q;
+ static ggml_cl_buffer buf_trans_m;
+ static ggml_cl_buffer buf_trans_d;
+ static ggml_cl_buffer buf_unpacked;
+
+ cl_int M = tensor->ne[1];
+ cl_int K = tensor->ne[0];
+
+ GGML_ASSERT(K % ggml_blck_size(tensor->type) == 0);
+
+ size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+ size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+ size_t size_m = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+ GGML_ASSERT(size_d + size_q + size_m == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+ buf_trans_q.allocate(backend_ctx->context, size_q);
+ buf_trans_m.allocate(backend_ctx->context, size_m);
+ buf_trans_d.allocate(backend_ctx->context, size_d);
+ buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
+
+ // transpose q, d, m back
+ transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
+ transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32);
+ transpose_2d_as_16b(backend_ctx, extra->m, buf_trans_m.buffer, size_m, M, K/32);
+
+ cl_uchar mask_0F = 0x0F;
+ cl_uchar mask_F0 = 0xF0;
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q4_1_noshuffle;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_m.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_unpacked.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0));
+
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+ return;
+ }
+ #endif

  cl_int err;
  cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
  ggml_nbytes(tensor), NULL, &err);
  CL_CHECK(err);

- #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
- if (use_adreno_moe_kernels(backend_ctx, tensor)) {
- cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q4_1;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->m));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device));

- int ne00 = tensor->ne[0];
- int ne01 = tensor->ne[1];
- int ne02 = tensor->ne[2];
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};

- size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+ if (tensor->type == GGML_TYPE_MXFP4) {
+ ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
+
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+ cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
+
+ int ne00 = tensor->ne[0];
+ int ne01 = tensor->ne[1];
+ int ne02 = tensor->ne[2];
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
  size_t local_work_size[3] = {64, 2, 1};

  cl_event evt;
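
The Q4_1 restore path above hands the 0x0F/0xF0 masks to the kernel to split packed nibbles. For orientation, a CPU-side sketch of the block layout being reconstructed, assuming the standard ggml block_q4_1 format (fp16 scale d, fp16 min m, then 16 nibble-packed bytes per 32 values; fp32 stand-ins used here for the fp16 fields):

    #include <cstdint>

    struct block_q4_1_f {
        float   d;       // scale
        float   m;       // min
        uint8_t qs[16];  // 32 4-bit quants, two per byte
    };

    // Low nibbles decode the first half of the block, high nibbles the second.
    static void dequant_q4_1(const block_q4_1_f & b, float out[32]) {
        for (int i = 0; i < 16; ++i) {
            out[i]      = b.d * (b.qs[i] & 0x0F) + b.m;  // mask_0F path
            out[i + 16] = b.d * (b.qs[i] >> 4)   + b.m;  // mask_F0 path
        }
    }
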
@@ -4216,6 +5197,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  ggml_nbytes(tensor), NULL, &err);
  CL_CHECK(err);

+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
+
+ int ne00 = tensor->ne[0];
+ int ne01 = tensor->ne[1];
+ GGML_ASSERT(tensor->ne[2] == 1);
+ GGML_ASSERT(tensor->ne[3] == 1);
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
+ size_t local_work_size[3] = {64, 1, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+ #endif
  cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
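The q/d kernel arguments above mirror the two halves of the block_q8_0 layout. A CPU reference sketch of the dequantization being restored, assuming the standard ggml format (fp16 scale plus 32 signed 8-bit quants per block; fp32 stand-in for the fp16 field):

    #include <cstdint>

    struct block_q8_0_f {
        float  d;       // scale
        int8_t qs[32];  // QK8_0 = 32 quants
    };

    static void dequant_q8_0(const block_q8_0_f & b, float out[32]) {
        for (int i = 0; i < 32; ++i) {
            out[i] = b.d * b.qs[i];  // value is simply scale * quant
        }
    }
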
@@ -4224,6 +5235,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
  size_t local_work_size[] = {1, 1, 1};

+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+ if (tensor->type == GGML_TYPE_Q6_K) {
+ ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
+
+ cl_int err;
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+ ggml_nbytes(tensor), NULL, &err);
+ CL_CHECK(err);
+
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[] = {1, 1, 1};
+
  cl_event evt;
  CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
  global_work_size, local_work_size, 0, NULL, &evt));
@@ -4347,7 +5386,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
  }

  static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
- *free = 0;
+ // no memory to report
+ *free = 0;
  *total = 0;

  GGML_UNUSED(dev);
@@ -4666,6 +5706,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
  }

+ // Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but
+ // nb[] is recalculated such that tensor is contiguous.
+ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+ cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ const int tensor_type_size = ggml_type_size(src->type);
+
+ const int ne00 = src->ne[0];
+ const int ne01 = src->ne[1];
+ const int ne02 = src->ne[2];
+ const int ne03 = src->ne[3];
+
+ const cl_ulong nb00 = src->nb[0];
+ const cl_ulong nb01 = src->nb[1];
+ const cl_ulong nb02 = src->nb[2];
+ const cl_ulong nb03 = src->nb[3];
+
+ const int ne0 = src->ne[0];
+ const int ne1 = src->ne[1];
+ const int ne2 = src->ne[2];
+ const int ne3 = src->ne[3];
+
+ nb0 = tensor_type_size;
+ nb1 = tensor_type_size*ne00;
+ nb2 = tensor_type_size*ne00*ne01;
+ nb3 = tensor_type_size*ne00*ne01*ne02;
+
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+ cl_ulong offset0 = extra->offset + src->view_offs;
+ cl_ulong offsetd = 0;
+
+ cl_kernel kernel;
+
+ switch (src->type) {
+ case GGML_TYPE_F32:
+ kernel = backend_ctx->kernel_cpy_f32_f32;
+ break;
+ case GGML_TYPE_F16:
+ kernel = backend_ctx->kernel_cpy_f16_f16;
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+ const int nth = MIN(64, ne00);
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+ }
+
  static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  UNUSED(backend);
  UNUSED(src0);
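
ggml_cl_copy_to_contiguous above keeps ne[] and rebuilds nb[] as dense strides. The recomputation it performs, isolated as a sketch:

    #include <cstdint>

    // Dense strides: element (i0, i1, i2, i3) lands at
    // i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3] with no gaps.
    static void contiguous_strides(const int64_t ne[4], int64_t type_size, int64_t nb[4]) {
        nb[0] = type_size;
        nb[1] = type_size * ne[0];
        nb[2] = type_size * ne[0] * ne[1];
        nb[3] = type_size * ne[0] * ne[1] * ne[2];
    }
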
@@ -4681,19 +5796,12 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
  GGML_ASSERT(dst);
  GGML_ASSERT(dst->extra);

- const int ne00 = src0->ne[0];
- const cl_ulong nb01 = src0->nb[1];
- const cl_ulong nb02 = src0->nb[2];
- const cl_ulong nb03 = src0->nb[3];
- const int ne10 = src1->ne[0];
- const cl_ulong nb10 = src1->nb[0];
- const int ne11 = src1->ne[1];
- const int ne12 = src1->ne[2];
- const cl_ulong nb11 = src1->nb[1];
- const cl_ulong nb12 = src1->nb[2];
- const cl_ulong nb1 = dst->nb[1];
- const cl_ulong nb2 = dst->nb[2];
- const cl_ulong nb3 = dst->nb[3];
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -4739,8 +5847,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
  CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

- size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
- size_t local_work_size[] = {64, 1, 1};
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+ int nth = 1;
+ while (nth < ne00 && 2*nth <= max_workgroup_size) {
+ nth *= 2;
+ }
+
+ size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};

  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
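
The get_rows change above replaces the fixed 64-thread row workgroup with the largest power of two that fits both the row length and the kernel's workgroup limit. The sizing rule in isolation:

    // Mirrors the loop above: nth is a power of two, nth <= max_workgroup_size,
    // and nth stops growing once it covers ne00.
    static int pick_nth(int ne00, int max_workgroup_size) {
        int nth = 1;
        while (nth < ne00 && 2 * nth <= max_workgroup_size) {
            nth *= 2;
        }
        return nth;
    }
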
@@ -5595,7 +6709,6 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
  GGML_UNUSED(src1);

  GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
- GGML_ASSERT(ggml_is_contiguous(src0));

  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -5618,7 +6731,14 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
  const cl_ulong nb2 = dst->nb[2];
  const cl_ulong nb3 = dst->nb[3];

- cl_kernel kernel = backend_ctx->kernel_mean_f32;
+ cl_kernel kernel;
+
+ const bool is_c4 = ne00 % 4 == 0;
+ if (is_c4) {
+ kernel = backend_ctx->kernel_mean_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_mean_f32;
+ }

  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -5635,7 +6755,7 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));

- size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+ size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
  size_t local_work_size[] = {(size_t)64, 1, 1};

  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
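
The mean change above moves from one work item per row to one 64-lane workgroup per row, which is why the first global dimension is multiplied by 64. The resulting launch geometry as a sketch:

    #include <cstddef>

    static void mean_launch_dims(size_t ne01, size_t ne02, size_t ne03,
                                 size_t global[3], size_t local[3]) {
        local[0]  = 64;        // one 64-lane workgroup ...
        local[1]  = 1;
        local[2]  = 1;
        global[0] = 64 * ne01; // ... per row being reduced
        global[1] = ne02;
        global[2] = ne03;
    }
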
@@ -5941,6 +7061,44 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
  }

+ static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ const int tri_type = ggml_get_op_params_i32(dst, 0);
+ const int64_t n = ggml_nelements(dst);
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+
+ cl_kernel kernel = backend_ctx->kernel_tri;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &n));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &tri_type));
+
+ size_t local_work_size[1] = { 256 };
+ size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+ }
+
  static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(dst);
  GGML_ASSERT(dst->extra);
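
ggml_cl_tri above pads its 1D launch up to a multiple of the 256-wide workgroup; the padded work items are presumably masked off inside the kernel against n. The rounding in isolation:

    #include <cstddef>

    static size_t round_up(size_t n, size_t lws) {
        return (n + lws - 1) / lws * lws;  // smallest multiple of lws >= n
    }
    // e.g. round_up(1000, 256) == 1024
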
@@ -6436,6 +7594,64 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }

+ static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
+ size_t sgs;
+ if (backend_ctx->gpu_family == ADRENO) {
+ sgs = 64;
+ } else if (backend_ctx->gpu_family == INTEL) {
+ sgs = 32;
+ } else {
+ GGML_ASSERT(false && "Unsupported GPU");
+ }
+
+ cl_kernel kernel = backend_ctx->kernel_l2_norm_f32;
+
+ int nth = sgs;
+ while (nth < ne00 && nth < (int)backend_ctx->get_kernel_workgroup_size(kernel)) {
+ nth *= 2;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
+
  static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src0->extra);
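
The workgroup sizing in ggml_cl_l2_norm above starts at the subgroup size (64 on Adreno, 32 on Intel) and doubles while the row is longer and the kernel's workgroup limit allows; the local-memory argument then holds one float per subgroup. The selection rule in isolation:

    // Mirrors the loop above; sgs is the subgroup size, max_wg the kernel's
    // maximum workgroup size as reported by the backend.
    static int pick_l2_norm_nth(int ne00, int sgs, int max_wg) {
        int nth = sgs;
        while (nth < ne00 && nth < max_wg) {
            nth *= 2;
        }
        return nth;
    }
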
@@ -6449,82 +7665,172 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

- cl_ulong offset0_abs = extra0->offset + src0->view_offs;
- cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
+
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

  cl_kernel kernel;
- if (dst->type == GGML_TYPE_F32) {
- kernel = backend_ctx->kernel_tanh_f32_nd;
- } else if (dst->type == GGML_TYPE_F16) {
- kernel = backend_ctx->kernel_tanh_f16_nd;
+
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16;
+ }
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr;
+ }
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
  } else {
- GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_tanh_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_tanh_f16_nc;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
- GGML_ASSERT(kernel != nullptr);
+ }

- const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
- const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
+ static void ggml_cl_neg(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);

- const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
- const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
+ UNUSED(src1);

- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
- size_t global_work_size[3];
- if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
- return;
- }
- global_work_size[0] = (size_t)ne10;
- global_work_size[1] = (size_t)ne11;
- global_work_size[2] = (size_t)ne12;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;

- size_t lws0 = 16, lws1 = 4, lws2 = 1;
- if (ne10 < 16) lws0 = ne10;
- if (ne11 < 4) lws1 = ne11;
- if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

- while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+ cl_kernel kernel;
+
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_neg_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_neg_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_neg_f32;
+ } else {
+ kernel = backend_ctx->kernel_neg_f16;
+ }
+ }

+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &n));

- size_t local_work_size[] = {lws0, lws1, lws2};
+ size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};

- size_t* local_work_size_ptr = local_work_size;
- if (!backend_ctx->non_uniform_workgroups) {
- if (global_work_size[0] % local_work_size[0] != 0 ||
- global_work_size[1] % local_work_size[1] != 0 ||
- global_work_size[2] % local_work_size[2] != 0) {
- local_work_size_ptr = NULL;
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ } else {
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_neg_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_neg_f16_nc;
  }
- }
- if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
  }

- static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cl_exp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src0->extra);
  GGML_ASSERT(dst);
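
The rewritten unary ops (tanh above, neg, and the exp/expm1/softplus functions that follow) all share one dispatch shape: contiguous tensors take a flat 1D kernel, vectorized four-wide when the element count divides by 4, and anything else falls back to a strided kernel with one 64-thread workgroup per row. A sketch of that decision, with illustrative names:

    enum class unary_path { flat_vec4, flat_scalar, strided };

    static unary_path pick_unary_path(bool contiguous, long long n) {
        if (!contiguous)  return unary_path::strided;     // *_nc kernels
        if (n % 4 == 0)   return unary_path::flat_vec4;   // *_4 kernels, n/4 items
        return unary_path::flat_scalar;                   // plain kernels, n items
    }
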
@@ -6537,18 +7843,90 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

- cl_ulong offset0_abs = extra0->offset + src0->view_offs;
- cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);

  cl_kernel kernel;
- if (dst->type == GGML_TYPE_F32) {
- kernel = backend_ctx->kernel_expm1_f32_nd;
- } else if (dst->type == GGML_TYPE_F16) {
- kernel = backend_ctx->kernel_expm1_f16_nd;
+
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_exp_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_exp_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_exp_f32;
+ } else {
+ kernel = backend_ctx->kernel_exp_f16;
+ }
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &n));
+
+ size_t global_work_size[] = {(size_t)CEIL_DIV(n, 64)*64, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  } else {
- GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_exp_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_exp_f16_nc;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;
+
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
- GGML_ASSERT(kernel != nullptr);
+ }
+
+ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;

  const int ne00 = src0->ne[0];
  const int ne01 = src0->ne[1];
@@ -6560,70 +7938,74 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
  const cl_ulong nb02 = src0->nb[2];
  const cl_ulong nb03 = src0->nb[3];

- const int ne10 = dst->ne[0];
- const int ne11 = dst->ne[1];
- const int ne12 = dst->ne[2];
- const int ne13 = dst->ne[3];
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

- const cl_ulong nb10 = dst->nb[0];
- const cl_ulong nb11 = dst->nb[1];
- const cl_ulong nb12 = dst->nb[2];
- const cl_ulong nb13 = dst->nb[3];
+ cl_kernel kernel;

- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_expm1_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_expm1_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_expm1_f32;
+ } else {
+ kernel = backend_ctx->kernel_expm1_f16;
+ }
+ }

- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
- size_t global_work_size[3];
- if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
- return;
- }
- global_work_size[0] = (size_t)ne10;
- global_work_size[1] = (size_t)ne11;
- global_work_size[2] = (size_t)ne12;
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};

- size_t lws0 = 16, lws1 = 4, lws2 = 1;
- if (ne10 < 16) lws0 = ne10;
- if (ne11 < 4) lws1 = ne11;
- if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr;
+ }

- while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+ } else {
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_expm1_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_expm1_f16_nc;
+ }

+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;

- size_t local_work_size[] = {lws0, lws1, lws2};
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};

- size_t* local_work_size_ptr = local_work_size;
- if (!backend_ctx->non_uniform_workgroups) {
- if (global_work_size[0] % local_work_size[0] != 0 ||
- global_work_size[1] % local_work_size[1] != 0 ||
- global_work_size[2] % local_work_size[2] != 0) {
- local_work_size_ptr = NULL;
- }
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
- if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
  }

  static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6639,18 +8021,8 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

- cl_ulong offset0_abs = extra0->offset + src0->view_offs;
- cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
-
- cl_kernel kernel;
- if (dst->type == GGML_TYPE_F32) {
- kernel = backend_ctx->kernel_softplus_f32_nd;
- } else if (dst->type == GGML_TYPE_F16) {
- kernel = backend_ctx->kernel_softplus_f16_nd;
- } else {
- GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
- }
- GGML_ASSERT(kernel != nullptr);
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;

  const int ne00 = src0->ne[0];
  const int ne01 = src0->ne[1];
@@ -6662,70 +8034,74 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
  const cl_ulong nb02 = src0->nb[2];
  const cl_ulong nb03 = src0->nb[3];

- const int ne10 = dst->ne[0];
- const int ne11 = dst->ne[1];
- const int ne12 = dst->ne[2];
- const int ne13 = dst->ne[3];
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

- const cl_ulong nb10 = dst->nb[0];
- const cl_ulong nb11 = dst->nb[1];
- const cl_ulong nb12 = dst->nb[2];
- const cl_ulong nb13 = dst->nb[3];
+ cl_kernel kernel;

- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+ if (ggml_is_contiguous(src0)) {
+ // Handle contiguous input
+ int n = ggml_nelements(dst);
+ if (n % 4 == 0) {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_softplus_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_softplus_f16_4;
+ }
+ n /= 4;
+ } else {
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_softplus_f32;
+ } else {
+ kernel = backend_ctx->kernel_softplus_f16;
+ }
+ }

- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
- size_t global_work_size[3];
- if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
- return;
- }
- global_work_size[0] = (size_t)ne10;
- global_work_size[1] = (size_t)ne11;
- global_work_size[2] = (size_t)ne12;
+ size_t global_work_size[] = {(size_t)n, 1, 1};
+ size_t local_work_size[] = {64, 1, 1};

- size_t lws0 = 16, lws1 = 4, lws2 = 1;
- if (ne10 < 16) lws0 = ne10;
- if (ne11 < 4) lws1 = ne11;
- if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr;
+ }

- while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
- while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+ } else {
+ // Handle non-contiguous input
+ if (src0->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_softplus_f32_nc;
+ } else {
+ kernel = backend_ctx->kernel_softplus_f16_nc;
+ }

+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;

- size_t local_work_size[] = {lws0, lws1, lws2};
+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};

- size_t* local_work_size_ptr = local_work_size;
- if (!backend_ctx->non_uniform_workgroups) {
- if (global_work_size[0] % local_work_size[0] != 0 ||
- global_work_size[1] % local_work_size[1] != 0 ||
- global_work_size[2] % local_work_size[2] != 0) {
- local_work_size_ptr = NULL;
- }
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
- if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
  }

  static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -6739,53 +8115,58 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con

  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

- if (backend_ctx->kernel_repeat == nullptr) {
- GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
- return;
- }
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

- ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
- ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;

- cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
- cl_ulong off_dst = extra_dst->offset + dst->view_offs;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
+
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];

- const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
- const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
+
+ cl_kernel kernel = backend_ctx->kernel_repeat_f32;

- const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
- const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

- cl_kernel kernel = backend_ctx->kernel_repeat;
+ int nth = 64;

- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
-
- size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
- size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
- size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
-
- size_t global_work_size[] = { gws0, gws1, gws2 };
+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};

- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }

  static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -7009,121 +8390,76 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
  GGML_ASSERT(dst->type == GGML_TYPE_F32);

  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
- cl_command_queue queue = backend_ctx->queue;

- if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
- GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
- return;
- }
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
+
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3];

- ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
- ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
- ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];

- cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
- cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
- cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

- const int32_t dim = ((const int32_t *) dst->op_params)[0];
+ const cl_int dim = ((const int32_t *) dst->op_params)[0];
  GGML_ASSERT(dim >= 0 && dim <= 3);

- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
- if (dim == 3) {
+ int nth = MIN(64, ne0);

- size_t nbytes_src0 = ggml_nbytes(src0);
- size_t nbytes_src1 = ggml_nbytes(src1);
+ cl_kernel kernel = backend_ctx->kernel_concat_f32;

- CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
- off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
- CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
- off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
- } else {
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
+
+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};

- cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
- size_t global_work_size[3];
-
- for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
- cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
- cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
- cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
-
- int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
- int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
- int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
-
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
-
- global_work_size[0] = d_ne0;
- global_work_size[1] = d_ne1;
- global_work_size[2] = d_ne2;
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
- }
- }
- } else {
- cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
-
- cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
- cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
-
- cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
-
- cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
- cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
-
-
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
-
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
-
- CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
- CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
- CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
- CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
-
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
- CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
- CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
- CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
- CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
- CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
- CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
- CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
-
- size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
- d_ne2 > 0 ? (size_t)d_ne2 : 1,
- d_ne3 > 0 ? (size_t)d_ne3 : 1 };
-
- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
- }
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }

  static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -7496,82 +8832,503 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
  region.size = nb02 * ne02;
  }

- A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+
+ // <--------------------------------------------> //
+
+ // create sub-buffer for B
+ // <--------------------------------------------> //
+ region.origin = (extra1->offset);
+ region.size = nb10 * ne10 * ne11 * ne12;
+ B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+
+ img_fmt_1d = {CL_RGBA, CL_FLOAT};
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ if (nb01 > nb02) {
+ img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
+ }
+ else {
+ img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
+ }
+ img_desc_1d.buffer = A_sub_buffer;
+ A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+ CL_CHECK(status);
+
+ // create sub-buffer for output C
+ // <--------------------------------------------> //
+ region.origin = (extrad->offset);
+ region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
+ D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+
+ // create image for C output
+ // <--------------------------------------------> //
+ img_fmt_1d = {CL_R, CL_FLOAT};
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
+ img_desc_1d.buffer = D_sub_buffer;
+ D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+ CL_CHECK(status);
+ // <--------------------------------------------> //
+
+ int offset_src0 = 0;
+ int offset_src1 = 0;
+
+ // set kernel args
+ // <--------------------------------------------> //
+ cl_uint k_arg = 0;
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
+
+ size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
+ size_t local_work_size[3] = {64, 1, 2};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+ // deallocate sub buffers and images
+ // <--------------------------------------------> //
+ CL_CHECK(clReleaseMemObject(A_image1d));
+ CL_CHECK(clReleaseMemObject(D_image1d));
+ CL_CHECK(clReleaseMemObject(A_sub_buffer));
+ CL_CHECK(clReleaseMemObject(B_sub_buffer));
+ CL_CHECK(clReleaseMemObject(D_sub_buffer));
+ }
+
+ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+ ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
+
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+
+ const int ne1 = dst->ne[1];
+
+ GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+ cl_context context = backend_ctx->context;
+ cl_kernel kernel;
+
+ cl_int err;
+ cl_image_format img_fmt;
+ cl_image_desc img_desc;
+ cl_buffer_region region;
+
+ int M = ne01;
+ int N = ne1;
+ int K = ne00;
+
+ if (ne1 == 1) {
+ cl_mem q_img = nullptr;
+ cl_mem b_sub_buf = nullptr;
+ cl_mem b_img = nullptr;
+
+ // image for q
+ img_fmt = { CL_R, CL_UNSIGNED_INT32};
+ memset(&img_desc, 0, sizeof(img_desc));
+ img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc.image_width = M * K / 2 / 4;
+ img_desc.buffer = extra0_q4_1->q;
+ CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+ // subbuffer for activations
+ region.origin = offset1;
+ region.size = K * N * sizeof(float);
+ CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+ // image for activations
+ img_fmt = {CL_RGBA, CL_FLOAT};
+ memset(&img_desc, 0, sizeof(img_desc));
+ img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc.image_width = K * N / 4;
+ img_desc.buffer = b_sub_buf;
+ CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+ kernel = backend_ctx->kernel_gemv_noshuffle_q4_1_f32;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
+
+ size_t local_work_size[3] = {64, 4, 1};
+ size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+ CL_CHECK(clReleaseMemObject(q_img));
+ CL_CHECK(clReleaseMemObject(b_sub_buf));
+ CL_CHECK(clReleaseMemObject(b_img));
+ } else {
+ cl_mem b_sub_buf = nullptr;
+ cl_mem b_sub_buf_trans = nullptr;
+ cl_mem b_img = nullptr;
+ cl_mem b_img_trans = nullptr;
+
+ // subbuffer for activations
+ region.origin = offset1;
+ region.size = K * N * sizeof(float);
+ CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+ // image for activations
+ img_fmt = {CL_RGBA, CL_FLOAT};
+ memset(&img_desc, 0, sizeof(img_desc));
+ img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc.image_width = K * N / 4;
+ img_desc.buffer = b_sub_buf;
+ CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+ // pad N to multiple of 8
+ int extra_elements = N % 8;
+ int padding = 0;
+ if (extra_elements > 0){
+ padding = 8 - extra_elements;
+ }
+
+ // subbuffer for transposed activations
+ region.origin = 0;
+ region.size = K * (N + padding) * sizeof(float)/2;
+ backend_ctx->prealloc_act_trans.allocate(context, region.size);
+ CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+ // image for transposed activations
+ img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+ memset(&img_desc, 0, sizeof(img_desc));
+ img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc.image_width = K * (N + padding) / 4;
+ img_desc.buffer = b_sub_buf_trans;
+ CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+ // transpose activations
+ int height_B = N/4;
+ if (height_B == 0) {
+ height_B = 1;
+ }
+ int width_B = K/4;
+ int padded_height_B = (N + padding)/4;
+
+ kernel = backend_ctx->kernel_transpose_32_16;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
+
+ size_t local_work_size_t[2] = { 1, 16 };
+ size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+ // gemm
+ kernel = backend_ctx->kernel_gemm_noshuffle_q4_1_f32;
+ int padded_N = N + padding;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img_trans));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &padded_N));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne1));
+
+ size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
+ size_t local_work_size[3] = {1, 128, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+ CL_CHECK(clReleaseMemObject(b_sub_buf));
+ CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+ CL_CHECK(clReleaseMemObject(b_img));
+ CL_CHECK(clReleaseMemObject(b_img_trans));
+ }
+ #else
+ GGML_UNUSED(backend);
+ GGML_UNUSED(src0);
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ #endif
+ }
+
+ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ const enum ggml_type src0t = src0->type;
+ const enum ggml_type src1t = src1->type;
+
+ GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
+ GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
+
+ GGML_ASSERT(src1->view_offs == 0);
+ GGML_ASSERT(dst->view_offs == 0);
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+
+ const int ne10 = src1->ne[0];
+ const int ne12 = src1->ne[2];
+
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+
+ GGML_ASSERT(ne00 == ne10);
+ GGML_ASSERT((ne00 % 32) == 0);
+ GGML_ASSERT(ne0 == ne01);
+
+ cl_context context = backend_ctx->context;
+ cl_kernel kernel;
+
+ // init CL objects
+ cl_int status;
+ cl_image_format img_fmt_1d;
+ cl_image_desc img_desc_1d;
+ cl_buffer_region region;
+ cl_mem A_image1d;
+ cl_mem B_image1d;
+ cl_mem B_sub_buffer;
+ cl_mem S_image1d;
+
+ cl_mem D_image1d;
+ cl_mem D_sub_buffer;
+
+ int M = ne01;
+ int N = ne1;
+ int K = ne00;
+
+ // create an image for A
+ img_fmt_1d = { CL_R, CL_FLOAT};
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
+ img_desc_1d.buffer = extra0_q8_0->q;
+ A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
  CL_CHECK(status);

- // <--------------------------------------------> //
+ // create an image for Scale
+ img_fmt_1d = { CL_R, CL_HALF_FLOAT};
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ img_desc_1d.image_width = M * K / 32; // Block size is 32
+ img_desc_1d.buffer = extra0_q8_0->d;
+ S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+ CL_CHECK(status);

- // create sub-buffer for B
- // <--------------------------------------------> //
- region.origin = (extra1->offset);
- region.size = nb10 * ne10 * ne11 * ne12;
+ // create a sub_buffer for B
+ region.origin = (extra1->offset); // + src1->view_offs);
+ region.size = K * N * sizeof(float);
  B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
  CL_CHECK(status);
- // <--------------------------------------------> //

+ // create an image for B from sub_buffer: RGBA (OCL)
  img_fmt_1d = {CL_RGBA, CL_FLOAT};
  memset(&img_desc_1d, 0, sizeof(img_desc_1d));
  img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
- if (nb01 > nb02) {
- img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
- }
- else {
- img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
- }
- img_desc_1d.buffer = A_sub_buffer;
- A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+ img_desc_1d.image_width = K * N / 4;
+ img_desc_1d.buffer = B_sub_buffer;
+ B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
  CL_CHECK(status);

- // create sub-buffer for output C
- // <--------------------------------------------> //
- region.origin = (extrad->offset);
- region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
+ // Create subbuffer and image1d_buffer for dst
+ region.origin = (extrad->offset); // + dst->view_offs;
+ region.size = M * N * sizeof(float);
  D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
  CL_CHECK(status);
- // <--------------------------------------------> //

- // create image for C output
- // <--------------------------------------------> //
  img_fmt_1d = {CL_R, CL_FLOAT};
  memset(&img_desc_1d, 0, sizeof(img_desc_1d));
  img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
- img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
+ img_desc_1d.image_width = M * N;
  img_desc_1d.buffer = D_sub_buffer;
  D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
  CL_CHECK(status);
- // <--------------------------------------------> //

- int offset_src0 = 0;
- int offset_src1 = 0;
+ size_t local_work_size[3] = {1, 1, 1};
+ size_t global_work_size[3] = {1, 1, 1};

- // set kernel args
- // <--------------------------------------------> //
- cl_uint k_arg = 0;
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
- CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
+ if (N == 1) {
+ kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;

- size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
- size_t local_work_size[3] = {64, 1, 2};
+ int r2 = 1;
+ int r3 = 1;
+ cl_uint k_arg = 0;
+
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
+
+ size_t wavesize = backend_ctx->adreno_wave_size;
+ local_work_size[0] = wavesize;
+ local_work_size[1] = 4; // reduce factor
+ local_work_size[2] = 1;
+
+ global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
+ global_work_size[1] = 4; // reduce factor
+ global_work_size[2] = 1;
+ } else {
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+ cl_mem B_image1d_trans = nullptr;
+ // for B transpose
+ cl_mem B_d = nullptr;
+ int padding;
+
+ //how many extra elements beyond multiple of 8
+ int extra_elements = N % 8;
+
+ //how much padding to add
+ padding = 0;
+ if (extra_elements > 0){
+ padding = 8 - extra_elements;
+ }
+
+ // Specify the starting offset (in bytes)
+ region.origin = 0;
+ // Specify the size of the sub-buffer (divide by 2 for FP16)
+ region.size = K * (N + padding) * sizeof(float)/2;
+ backend_ctx->prealloc_act_trans.allocate(context, region.size);
+ B_d = clCreateSubBuffer(
+ backend_ctx->prealloc_act_trans.buffer,
+ 0,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &region,
+ &status);
+ CL_CHECK(status);
+
+ cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
+ cl_image_desc image_desc_B_d_output = {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+ static_cast<size_t>(K * (N + padding)/4),
+ 0, 0, 0, 0, 0, 0, 0, { B_d }
+ };
+ B_image1d_trans = clCreateImage(
+ context,
+ 0,
+ &image_format_B_d_output,
+ &image_desc_B_d_output,
+ NULL,
+ &status);
+ CL_CHECK(status);
+
+ int height_B = N/4;
+ if (height_B == 0) {
+ height_B = 1;
+ }
+ int width_B = K/4;
+ int padded_height_B = (N + padding)/4;
+
+ kernel = backend_ctx->kernel_transpose_32_16;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
+
+ size_t local_size_t[2] = { 1, 16 };
+ size_t global_size_t[2] = {
+ static_cast<size_t>(width_B),
+ static_cast<size_t>(padded_height_B)
+ };
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
+
+ kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
+
+ int N_with_padding = N + padding;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
+
+ global_work_size[0] = (size_t)(N + 7) / 8;
+ global_work_size[1] = (size_t)(M + 3) / 4;
+ global_work_size[2] = 1;

+ local_work_size[0] = 2;
+ local_work_size[1] = 128;
+ local_work_size[2] = 1;
+ }
+
+ // enqueue kernel with profiling
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

  // deallocate sub buffers and images
- // <--------------------------------------------> //
  CL_CHECK(clReleaseMemObject(A_image1d));
- CL_CHECK(clReleaseMemObject(D_image1d));
- CL_CHECK(clReleaseMemObject(A_sub_buffer));
  CL_CHECK(clReleaseMemObject(B_sub_buffer));
+ CL_CHECK(clReleaseMemObject(B_image1d));
+ CL_CHECK(clReleaseMemObject(S_image1d));
  CL_CHECK(clReleaseMemObject(D_sub_buffer));
+ CL_CHECK(clReleaseMemObject(D_image1d));
+ #else
+ GGML_UNUSED(backend);
+ GGML_UNUSED(src0);
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ #endif
  }

  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7597,8 +9354,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

  #ifdef GGML_OPENCL_SOA_Q
  ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+ ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
  ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
  ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
+ ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
  #endif

  const int ne00 = src0 ? src0->ne[0] : 0;
@@ -7641,9 +9400,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  cl_context context = backend_ctx->context;

  if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
- if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
+ if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 &&
+ // dst is wrapped with image1d_buffer, the size limit applies, also src0
+ (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
  // For KQ
  if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
+ ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
  nb00 <= nb02 &&
  nb02 <= nb01 &&
  nb01 <= nb03 &&
@@ -7654,7 +9416,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  return;
  }
  // For KQV
- if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+ if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+ ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
  ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
  return;
  }
@@ -7686,6 +9449,23 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  int padding;
  // <--------------------------------------------> //

+ // NOTE: Kernels using image1d_buffer_t (e.g., src0_q) would normally require
+ // a limit check, but q4_0 / q4_1 tensors are very unlikely to exceed that
+ // limit, so the check is omitted.
+
+ // q4_1 x fp32
+ if (src0t == GGML_TYPE_Q4_1 && src1t == GGML_TYPE_F32) {
+ ggml_cl_mul_mat_q4_1_f32_adreno(backend, src0, src1, dst);
+ return;
+ }
+
+ // q8_0 x fp32
+ if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
+ enable_adreno_trans_weight(backend_ctx, src0)) {
+ ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
+ return;
+ }
+
  // q4_0 x fp32
  if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
  // TODO: remove duplicate definitions of image description + format -- move to top
@@ -7960,9 +9740,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

  // GEMM using local memory
  // Current BK = 16, so ne00 % 16 == 0
- if (ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
- src1t == GGML_TYPE_F32 &&
+ if (src1t == GGML_TYPE_F32 &&
  ne00 % 16 == 0 &&
  ne11 > 1) {
  switch(src0t) {
@@ -7974,10 +9752,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  int batch_stride_b = ne10*ne11;
  int batch_stride_d = ne0*ne1;

- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ cl_mem mem_src0 = extra0->data_device;
+ cl_mem mem_src1 = extra1->data_device;
+
+ cl_ulong nb00_cont = nb00;
+ cl_ulong nb01_cont = nb01;
+ cl_ulong nb02_cont = nb02;
+ cl_ulong nb03_cont = nb03;
+
+ cl_ulong nb10_cont = nb10;
+ cl_ulong nb11_cont = nb11;
+ cl_ulong nb12_cont = nb12;
+ cl_ulong nb13_cont = nb13;
+
+ cl_ulong offset0_cont = offset0;
+ cl_ulong offset1_cont = offset1;
+
+ if (!ggml_is_contiguous(src0)) {
+ backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+ ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+ nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+ mem_src0 = backend_ctx->prealloc_src0.buffer;
+ offset0_cont = 0;
+ }
+
+ if (!ggml_is_contiguous(src1)) {
+ backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+ ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+ nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+ mem_src1 = backend_ctx->prealloc_src1.buffer;
+ offset1_cont = 0;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
@@ -8009,8 +9819,82 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  int batch_stride_b = ne10*ne11;
  int batch_stride_d = ne0*ne1;
 
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ cl_mem mem_src0 = extra0->data_device;
+ cl_mem mem_src1 = extra1->data_device;
+
+ cl_ulong nb00_cont = nb00;
+ cl_ulong nb01_cont = nb01;
+ cl_ulong nb02_cont = nb02;
+ cl_ulong nb03_cont = nb03;
+
+ cl_ulong nb10_cont = nb10;
+ cl_ulong nb11_cont = nb11;
+ cl_ulong nb12_cont = nb12;
+ cl_ulong nb13_cont = nb13;
+
+ cl_ulong offset0_cont = offset0;
+ cl_ulong offset1_cont = offset1;
+
+ if (!ggml_is_contiguous(src0)) {
+ backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+ ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+ nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+ mem_src0 = backend_ctx->prealloc_src0.buffer;
+ offset0_cont = 0;
+ }
+
+ if (!ggml_is_contiguous(src1)) {
+ backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+ ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+ nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+ mem_src1 = backend_ctx->prealloc_src1.buffer;
+ offset1_cont = 0;
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+
+ // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+ size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ return;
+ }
+ case GGML_TYPE_Q4_0: {
+ if (ne11 < 32) {
+ break;
+ }
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+ break;
+ }
+
+ kernel = backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm;
+ nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+ int batch_stride_a = ne00*ne01;
+ int batch_stride_b = ne10*ne11;
+ int batch_stride_d = ne0*ne1;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
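Note on the hunk above: ggml_cl_mul_mat no longer assumes contiguous inputs for the tiled path. When src0 or src1 is strided, it is first repacked into a grow-only scratch buffer (prealloc_src0 / prealloc_src1) via ggml_cl_copy_to_contiguous, and the kernel then runs on the dense copy with offset 0. A minimal host-side sketch of the grow-only scratch pattern; Scratch is a hypothetical stand-in for the backend's prealloc type, which this diff does not show:

    #include <CL/cl.h>

    // Grow-only device scratch: reallocate only when the request exceeds the
    // current capacity, so repeated repacks of similar sizes reuse one buffer.
    struct Scratch {
        cl_mem buffer = nullptr;
        size_t size   = 0;
        void allocate(cl_context ctx, size_t n) {
            if (n <= size) {
                return;                      // existing buffer is large enough
            }
            if (buffer) {
                clReleaseMemObject(buffer);  // drop the old, too-small buffer
            }
            cl_int err = CL_SUCCESS;
            buffer = clCreateBuffer(ctx, CL_MEM_READ_WRITE, n, nullptr, &err);
            size   = (err == CL_SUCCESS) ? n : 0;
        }
    };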
@@ -8036,10 +9920,57 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  return;
  }
+ case GGML_TYPE_Q4_1: {
+ if (ne11 < 32) {
+ break;
+ }
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+ break;
+ }
+
+ kernel = backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm;
+ nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+ int batch_stride_a = ne00*ne01;
+ int batch_stride_b = ne10*ne11;
+ int batch_stride_d = ne0*ne1;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_a
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_b
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne01)); // stride_d
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_a));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_b));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_d));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r3));
+
+ // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+ size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ return;
+ }
  case GGML_TYPE_Q8_0: {
  if (ne11 < 32) {
  break;
  }
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+ break;
+ }
+
  kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
  nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 
@@ -8074,6 +10005,50 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  return;
  }
+ case GGML_TYPE_Q6_K: {
+ if (ne11 < 32) {
+ break;
+ }
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+ break;
+ }
+
+ kernel = backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm;
+ nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+ int batch_stride_a = ne00*ne01;
+ int batch_stride_b = ne10*ne11;
+ int batch_stride_d = ne0*ne1;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
+
+ // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+ size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ return;
+ }
  default:
  break;
  }
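The mul_mm_*_l4_lm cases above all share one launch recipe: the output matrix is tiled into BM x BN = 64 x 64 blocks and each block is computed by nth0 = (BM*BN)/(TM*TN) = 128 work-items, with CEIL_DIV rounding partial tiles up. A worked example with made-up sizes:

    #include <cstdio>
    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

    int main() {
        const int ne01 = 4096, ne11 = 100, ne12 = 2, ne13 = 1, nth0 = 128;
        size_t gws0 = (size_t)(CEIL_DIV(ne01, 64) * nth0); // 64 row tiles * 128 = 8192
        size_t gws1 = (size_t) CEIL_DIV(ne11, 64);         // 100 cols -> 2 tile columns
        size_t gws2 = (size_t) ne12 * ne13;                // one slice per batch entry
        printf("%zu x %zu x %zu\n", gws0, gws1, gws2);     // 8192 x 2 x 2
    }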
@@ -8328,7 +10303,71 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
  #endif // GGML_OPENCL_SOA_Q
  break;
- case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_1: {
+ #ifdef GGML_OPENCL_SOA_Q
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 1;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ kernel = backend_ctx->kernel_mul_mv_q4_1_f32_flat;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_1->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_1->d));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_1->m));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r3));
+ #else
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 1;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ kernel = backend_ctx->kernel_mul_mv_q4_1_f32;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+ #endif // GGML_OPENCL_SOA_Q
+ break;
+ }
  case GGML_TYPE_Q8_0: {
  #ifdef GGML_OPENCL_SOA_Q
  kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
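kernel_mul_mv_q4_1_f32_flat above is the GGML_OPENCL_SOA_Q variant: the Q4_1 weights arrive as three separate buffers (extra0_q4_1->q, ->d, ->m) instead of interleaved blocks, which keeps quant loads coalesced. A CPU-side sketch of that structure-of-arrays split, assuming ggml's block_q4_1 layout (32 weights per block); the split helper is illustrative, not the backend's API:

    #include <cstdint>
    #include <vector>

    struct block_q4_1 {
        uint16_t d;       // scale, fp16 bits
        uint16_t m;       // min,   fp16 bits
        uint8_t  qs[16];  // 32 packed 4-bit quants
    };

    // Interleaved blocks -> three flat arrays (q, d, m).
    static void split_q4_1(const std::vector<block_q4_1> & blocks,
                           std::vector<uint8_t>  & q,
                           std::vector<uint16_t> & d,
                           std::vector<uint16_t> & m) {
        for (const block_q4_1 & b : blocks) {
            q.insert(q.end(), b.qs, b.qs + 16);
            d.push_back(b.d);
            m.push_back(b.m);
        }
    }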
@@ -8407,19 +10446,89 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  #endif // GGML_OPENCL_SOA_Q
  break;
  }
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K: {
+ kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 1;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 1;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
+ break;
+ }
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ #ifdef GGML_OPENCL_SOA_Q
+ kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
+
+ if (backend_ctx->gpu_family == INTEL) {
+ nth0 = 16;
+ nth1 = 2;
+ ndst = 4;
+ } else if (backend_ctx->gpu_family == ADRENO) {
+ nth0 = 64;
+ nth1 = 2;
+ ndst = 4;
+ } else {
+ GGML_ASSERT(false && "TODO: Unknown GPU");
+ }
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
+ #else
  kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
 
  if (backend_ctx->gpu_family == INTEL) {
- nth0 = 2;
- nth1 = 16;
+ nth0 = 16;
+ nth1 = 2;
+ ndst = 1;
  } else if (backend_ctx->gpu_family == ADRENO) {
- nth0 = 2;
- nth1 = 64;
+ nth0 = 64;
+ nth1 = 2;
+ ndst = 1;
  } else {
  GGML_ASSERT(false && "TODO: Unknown GPU");
  }
@@ -8439,6 +10548,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
+ #endif // GGML_OPENCL_SOA_Q
  break;
  case GGML_TYPE_MXFP4: {
  #ifdef GGML_OPENCL_SOA_Q
@@ -8535,13 +10645,16 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  } else if (src0t == GGML_TYPE_Q4_K) {
- GGML_ASSERT(false && "not implemented");
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  } else if (src0t == GGML_TYPE_Q3_K) {
  GGML_ASSERT(false && "not implemented");
  } else if (src0t == GGML_TYPE_Q5_K) {
  GGML_ASSERT(false && "not implemented");
  } else if (src0t == GGML_TYPE_Q6_K) {
- size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
  size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
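The launch math above replaces the hard-coded (ne01+1)/2 with a general formula: a workgroup of nth0 x nth1 work-items now produces ndst*nth1 output rows, so the first global dimension is ceil(ne01 / (ndst*nth1)) * nth0. Worked through for the Adreno Q6_K path (nth0 = 64, nth1 = 2, ndst = 4):

    #include <cstdio>

    int main() {
        const int ne01 = 4097, ndst = 4, nth1 = 2, nth0 = 64;
        const int rows_per_wg = ndst * nth1;                               // 8 rows per group
        const size_t gws0 = (size_t)(ne01 + rows_per_wg - 1) / rows_per_wg * nth0;
        printf("%zu\n", gws0);                                             // 513 groups -> 32832
    }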
@@ -8973,7 +11086,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
  cl_ulong offset0 = extra0->offset + src0->view_offs;
  cl_ulong offsetd = extrad->offset + dst->view_offs;
 
- cl_kernel kernel = backend_ctx->kernel_scale;
+ cl_kernel kernel;
+
+ int n = ggml_nelements(dst);
+
+ if (n % 4 == 0) {
+ kernel = backend_ctx->kernel_scale_f32_4;
+ n /= 4;
+ } else {
+ kernel = backend_ctx->kernel_scale_f32;
+ }
 
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -8982,8 +11104,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
 
- int n = ggml_nelements(dst)/4;
-
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
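ggml_cl_scale now picks between a scalar and a float4 kernel based on whether ggml_nelements(dst) is divisible by 4 (note the matching n /= 4). The kernel sources are not part of this diff; a hedged sketch of what the pair plausibly looks like, using the usual base-pointer-plus-byte-offset convention:

    // Assumed device-side shapes of the two kernels, for illustration only.
    static const char * k_scale_sketch = R"CLC(
    kernel void kernel_scale_f32(global char * src0, ulong offset0,
                                 global char * dst,  ulong offsetd,
                                 float scale, float bias) {
        global const float * s = (global const float *)(src0 + offset0);
        global       float * d = (global       float *)(dst  + offsetd);
        d[get_global_id(0)] = s[get_global_id(0)] * scale + bias;
    }
    kernel void kernel_scale_f32_4(global char * src0, ulong offset0,
                                   global char * dst,  ulong offsetd,
                                   float scale, float bias) {
        global const float4 * s = (global const float4 *)(src0 + offset0);
        global       float4 * d = (global       float4 *)(dst  + offsetd);
        d[get_global_id(0)] = s[get_global_id(0)] * scale + bias;  // 4 elements per item
    }
    )CLC";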
 
@@ -9005,28 +11125,13 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
  // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
  UNUSED(dst);
 
- const int ne00 = src0 ? src0->ne[0] : 0;
- const int ne01 = src0 ? src0->ne[1] : 0;
- const int ne02 = src0 ? src0->ne[2] : 0;
- const int ne03 = src0 ? src0->ne[3] : 0;
-
- const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
- const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
-
- const int ne10 = src1 ? src1->ne[0] : 0;
- const int ne11 = src1 ? src1->ne[1] : 0;
- const int ne12 = src1 ? src1->ne[2] : 0;
- const int ne13 = src1 ? src1->ne[3] : 0;
-
- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
- const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
- const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
 
- const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
- const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+ const enum ggml_type src0t = src0->type;
+ const enum ggml_type src1t = src1->type;
 
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -9063,6 +11168,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
  GGML_ASSERT(false && "not implemented");
  }
  break;
+ case GGML_TYPE_I32:
+ switch (src1t) {
+ case GGML_TYPE_I32:
+ kernel = backend_ctx->kernel_cpy_i32_i32;
+ break;
+ default:
+ GGML_ASSERT(false && "not implemented");
+ }
+ break;
  default:
  GGML_ASSERT(false && "not implemented");
  }
@@ -9101,6 +11215,89 @@ static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const
  UNUSED(src1);
  }
 
+ static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32) &&
+ src1->type == src0->type && dst->type == src0->type);
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne1, src1, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ const cl_ulong pnb1 = ((const int32_t *)dst->op_params)[0];
+ const cl_ulong pnb2 = ((const int32_t *)dst->op_params)[1];
+ const cl_ulong pnb3 = ((const int32_t *)dst->op_params)[2];
+ const cl_ulong offs = ((const int32_t *)dst->op_params)[3];
+ const bool inplace = (bool)((const int32_t *)dst->op_params)[4];
+
+ cl_kernel kernel = nullptr;
+
+ // for inplace case, dst is a view of src0 and is updated on top of it
+ // so for non-inplace case, copy src0 to dst first
+ if (!inplace) {
+ ggml_cl_cpy(backend, src0, dst, nullptr);
+ }
+
+ // then copy src1 to dst with specified offset
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ kernel = backend_ctx->kernel_cpy_f32_f32;
+ } else if (src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+ kernel = backend_ctx->kernel_cpy_i32_i32;
+ } else {
+ GGML_ASSERT(false && "not implemented");
+ }
+
+ offsetd += offs;
+ cl_ulong nb = ggml_element_size(dst);
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne13));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb10));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb11));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb12));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb13));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &pnb1));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &pnb2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &pnb3));
+
+ int max_local_size = backend_ctx->get_kernel_workgroup_size(kernel);
+
+ const int nth = MIN(max_local_size, ne00);
+
+ size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
+
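ggml_cl_set above follows the CPU semantics of GGML_OP_SET: unless the op is marked in-place, dst first receives a full copy of src0, then src1 is written into dst at the byte offset and destination strides carried in op_params. A CPU reference sketch of those semantics (hypothetical helper, float case, contiguous src1 assumed):

    #include <cstdint>
    #include <cstddef>
    #include <cstring>

    // op_params layout assumed as above: {nb1, nb2, nb3, offset, inplace}.
    static void set_f32_ref(const float * src0, const float * src1, float * dst,
                            const int64_t ne1[4],   // src1 shape
                            const size_t  nb[4],    // dst byte strides from op_params
                            size_t offset, bool inplace, size_t src0_bytes) {
        if (!inplace) {
            memcpy(dst, src0, src0_bytes);          // step 1: dst = src0
        }
        char * base = (char *)dst + offset;          // step 2: write src1 into the view
        for (int64_t i3 = 0; i3 < ne1[3]; ++i3)
        for (int64_t i2 = 0; i2 < ne1[2]; ++i2)
        for (int64_t i1 = 0; i1 < ne1[1]; ++i1)
        for (int64_t i0 = 0; i0 < ne1[0]; ++i0) {
            *((float *)(base + i1*nb[1] + i2*nb[2] + i3*nb[3]) + i0) = *src1++;
        }
    }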
  static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src0->extra);
@@ -9163,6 +11360,49 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
  }
  }
 
+ static void ggml_cl_diag(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ UNUSED(src1);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+ GGML_TENSOR_LOCALS(int, ne, dst, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
+
+ cl_kernel kernel = backend_ctx->kernel_diag_f32;
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne0));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb0));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb2));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb3));
+
+ int nth = 64;
+
+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+ size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
+
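For reference, GGML_OP_DIAG expands each input row of length ne0 into an ne0 x ne0 matrix carrying the row on its diagonal and zeros elsewhere; kernel_diag_f32 does this per (ne2, ne3) slice, which is why the launch is {ne1*nth, ne2, ne3}. A one-slice CPU sketch:

    // dst is ne0 x ne0, row-major; src is one row of length ne0.
    static void diag_ref(const float * src, float * dst, int ne0) {
        for (int i = 0; i < ne0; ++i) {
            for (int j = 0; j < ne0; ++j) {
                dst[i*ne0 + j] = (i == j) ? src[i] : 0.0f;
            }
        }
    }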
  static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src0->extra);
@@ -9474,6 +11714,72 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
 
+ static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(src1);
+ GGML_ASSERT(src1->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ cl_kernel kernel = backend_ctx->kernel_solve_tri_f32;
+ GGML_ASSERT(kernel != nullptr);
+
+ const int n = src0->ne[0];
+ const int k = src1->ne[0];
+
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];
+
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3];
+
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &k));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),&nb10));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong),&nb11));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong),&nb12));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong),&nb13));
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb0));
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb1));
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb2));
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb3));
+
+ size_t global_work_size[3]= { (size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
+ size_t local_work_size[] = {16, 4, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+ }
+
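ggml_cl_solve_tri above solves a triangular system per batch slice: src0 supplies an n x n triangular matrix A, src1 supplies k columns B, and dst receives X with A*X = B. A CPU reference for the common lower-triangular case via forward substitution; row-major layout and the choice of triangle are assumptions for illustration:

    // Solve A*X = B column by column, A lower triangular (n x n), B and X n x k.
    static void solve_tri_lower_ref(const float * A, const float * B, float * X,
                                    int n, int k) {
        for (int col = 0; col < k; ++col) {
            for (int i = 0; i < n; ++i) {
                float s = B[i*k + col];
                for (int j = 0; j < i; ++j) {
                    s -= A[i*n + j] * X[j*k + col];   // subtract already-solved terms
                }
                X[i*k + col] = s / A[i*n + i];        // divide by the diagonal entry
            }
        }
    }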
  static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src1);
@@ -9611,7 +11917,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
  GGML_UNUSED(src1);
 
  GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
- GGML_ASSERT(ggml_is_contiguous(src0));
 
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -9634,7 +11939,14 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
  const cl_ulong nb2 = dst->nb[2];
  const cl_ulong nb3 = dst->nb[3];
 
- cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+ cl_kernel kernel;
+
+ const bool is_c4 = ne00 % 4 == 0;
+ if (is_c4) {
+ kernel = backend_ctx->kernel_sum_rows_f32_4;
+ } else {
+ kernel = backend_ctx->kernel_sum_rows_f32;
+ }
 
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -9651,12 +11963,124 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
 
- size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+ size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
  size_t local_work_size[] = {(size_t)64, 1, 1};
 
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
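The sum_rows launch above moves from one work-item per row to a 64-wide workgroup per row (hence 64 * ne01 in dimension 0), with a float4 variant chosen when ne00 % 4 == 0. The kernels themselves are not in this diff; a hedged OpenCL sketch of the cooperative reduction this geometry implies, scalar form:

    static const char * k_sum_rows_sketch = R"CLC(
    kernel void sum_rows_f32_sketch(global const float * src, global float * dst, int ne00) {
        local float tmp[64];
        const int row = get_group_id(0);
        const int lid = get_local_id(0);
        float s = 0.0f;
        for (int i = lid; i < ne00; i += 64) {
            s += src[row*ne00 + i];                    // strided partial sums per lane
        }
        tmp[lid] = s;
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int off = 32; off > 0; off >>= 1) {       // tree reduction in local memory
            if (lid < off) tmp[lid] += tmp[lid + off];
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        if (lid == 0) dst[row] = tmp[0];
    }
    )CLC";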
 
+ static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0);
+ GGML_ASSERT(src0->extra);
+ GGML_ASSERT(dst);
+ GGML_ASSERT(dst->extra);
+ GGML_UNUSED(src1);
+
+ GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+ GGML_TENSOR_LOCALS(int, ne0, src0, ne);
+ GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
+ cl_kernel kernel = backend_ctx->kernel_cumsum_blk;
+
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+ int nth = 1;
+ while (nth < ne00 && 2*nth <= max_workgroup_size) {
+ nth *= 2;
+ }
+
+ GGML_ASSERT(ne00 <= nth*nth);
+
+ const int net0 = CEIL_DIV(ne00, nth);
+ const int net1 = ne01;
+ const int net2 = ne02;
+ const int net3 = ne03;
+
+ const cl_ulong nbt0 = sizeof(float);
+ const cl_ulong nbt1 = net0*nbt0;
+ const cl_ulong nbt2 = net1*nbt1;
+ const cl_ulong nbt3 = net2*nbt2;
+
+ static ggml_cl_buffer tmp_buffer;
+ tmp_buffer.allocate(backend_ctx->context, net0*ne01*ne02*ne03*sizeof(float));
+
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));
+
+ size_t global_work_size[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
+ size_t local_work_size[] = { (size_t)nth, 1, 1};
+
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+ if(ne00 > nth) {
+ // if a single workgroup cannot handle an entire row, each workgroup
+ // computes a partial sum and stores it to dst; tmp_buffer holds the sum
+ // of each workgroup; cumsum this buffer and add to the partial sums in dst
+ cl_ulong offsett = 0;
+ kernel = backend_ctx->kernel_cumsum_blk;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsett));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &tmp_buffer.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsett));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &net0));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nbt0));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nbt1));
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nbt2));
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nbt3));
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));
+
+ size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
+ size_t local_work_size_1[] = { (size_t)nth, 1, 1};
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);
+
+ kernel = backend_ctx->kernel_cumsum_add;
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne02));
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne03));
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &nbt0));
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nbt1));
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &nbt2));
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nbt3));
+
+ size_t global_work_size_2[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
+ size_t local_work_size_2[] = { (size_t)nth, 1, 1};
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
+ }
+ }
+
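ggml_cl_cumsum above is a two-level scan: kernel_cumsum_blk scans each nth-wide block of a row and writes per-block totals into tmp_buffer; when a row spans several blocks, the same kernel then scans those totals and kernel_cumsum_add folds each block's prefix back into dst. A tiny CPU model of the composition; with nth = 4 and x = {1..8}, block totals {10, 26} scan to {10, 36}, and adding 10 to the second block completes the cumsum:

    #include <vector>

    static std::vector<float> cumsum_two_level(const std::vector<float> & x, int nth) {
        const int n    = (int)x.size();
        const int nblk = (n + nth - 1) / nth;
        std::vector<float> y(n), blk(nblk, 0.0f);
        for (int b = 0; b < nblk; ++b) {                  // pass 1: per-block inclusive scan
            float run = 0.0f;
            for (int i = b*nth; i < n && i < (b+1)*nth; ++i) {
                run += x[i];
                y[i] = run;
            }
            blk[b] = run;                                 // block total
        }
        for (int b = 1; b < nblk; ++b) {                  // pass 2: scan the block totals
            blk[b] += blk[b-1];
        }
        for (int b = 1; b < nblk; ++b) {                  // pass 3: add preceding blocks' sum
            for (int i = b*nth; i < n && i < (b+1)*nth; ++i) {
                y[i] += blk[b-1];
            }
        }
        return y;
    }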
  static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0);
  GGML_ASSERT(src0->extra);
@@ -9802,6 +12226,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_cpy;
  break;
+ case GGML_OP_SET:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_set;
+ break;
  case GGML_OP_DUP:
  case GGML_OP_CONT:
  if (!any_on_device) {
@@ -9901,6 +12331,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_tanh;
  break;
+ case GGML_UNARY_OP_NEG:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_neg;
+ break;
+ case GGML_UNARY_OP_EXP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_exp;
+ break;
  case GGML_UNARY_OP_EXPM1:
  if (!any_on_device) {
  return false;
@@ -9922,6 +12364,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_glu;
  break;
+ case GGML_OP_TRI:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_tri;
+ break;
  case GGML_OP_FILL:
  if (!any_on_device) {
  return false;
@@ -9946,6 +12394,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_rms_norm;
  break;
+ case GGML_OP_L2_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_l2_norm;
+ break;
  case GGML_OP_GROUP_NORM:
  if (!any_on_device) {
  return false;
@@ -10021,6 +12475,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_nop;
  break;
+ case GGML_OP_DIAG:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_diag;
+ break;
  case GGML_OP_DIAG_MASK_INF:
  if (!any_on_device) {
  return false;
@@ -10039,6 +12499,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_rope;
  break;
+ case GGML_OP_SOLVE_TRI:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_solve_tri;
+ break;
  case GGML_OP_IM2COL:
  if (!any_on_device) {
  return false;
@@ -10057,6 +12523,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
  }
  func = ggml_cl_sum_rows;
  break;
+ case GGML_OP_CUMSUM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cl_cumsum;
+ break;
  case GGML_OP_FLASH_ATTN_EXT:
  if (!any_on_device) {
  return false;