@novastera-oss/llamarn 0.2.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  19. package/cpp/llama.cpp/README.md +4 -5
  20. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  21. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  22. package/cpp/llama.cpp/common/arg.cpp +17 -0
  23. package/cpp/llama.cpp/common/chat.cpp +37 -20
  24. package/cpp/llama.cpp/common/chat.h +2 -0
  25. package/cpp/llama.cpp/common/common.h +4 -0
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
  27. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  28. package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
  29. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  30. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  33. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  35. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  43. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  47. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  68. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
  69. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
  70. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
  71. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
  73. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
  92. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  93. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  117. package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
  118. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
  120. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  121. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
  122. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  123. package/cpp/llama.cpp/include/llama.h +0 -40
  124. package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
  125. package/cpp/llama.cpp/src/llama-arch.h +18 -1
  126. package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
  127. package/cpp/llama.cpp/src/llama-batch.h +8 -1
  128. package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
  129. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  130. package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
  131. package/cpp/llama.cpp/src/llama-graph.h +47 -60
  132. package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
  133. package/cpp/llama.cpp/src/llama-hparams.h +3 -0
  134. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  138. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  139. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  141. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
  142. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  143. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  144. package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
  145. package/cpp/llama.cpp/src/llama-model.h +18 -0
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
  148. package/cpp/llama.cpp/src/llama-vocab.h +41 -0
  149. package/ios/include/chat.h +2 -0
  150. package/ios/include/common.h +4 -0
  151. package/ios/include/llama.h +0 -40
  152. package/ios/libs/llama.xcframework/Info.plist +19 -19
  153. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
  155. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
  158. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  163. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  164. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  165. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
  172. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  173. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  174. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
  175. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  176. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  177. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  178. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
  179. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  180. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
  183. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  184. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  185. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
  186. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  187. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  188. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  189. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  190. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  191. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  192. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  193. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  194. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  195. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
  196. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  197. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  198. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
  199. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
  202. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
  203. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  204. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  205. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  206. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  207. package/package.json +1 -1
  208. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  209. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  210. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  211. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  212. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  213. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  214. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  215. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  216. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  217. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  218. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  219. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  220. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  221. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  222. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  223. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  224. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  225. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  226. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  227. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  228. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  229. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  230. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  231. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  232. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  233. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  234. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  235. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  236. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  237. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  238. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  239. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  240. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  241. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  242. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  243. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  244. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  245. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  246. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  247. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -351,6 +351,8 @@ struct ggml_backend_opencl_context {
351
351
  cl_program program_gemv_noshuffle_general;
352
352
  cl_program program_gemv_noshuffle;
353
353
  cl_program program_get_rows;
354
+ cl_program program_set_rows;
355
+ cl_program program_glu;
354
356
  cl_program program_im2col_f16;
355
357
  cl_program program_im2col_f32;
356
358
  cl_program program_mul_mat_Ab_Bi_8x4;
@@ -366,6 +368,7 @@ struct ggml_backend_opencl_context {
366
368
  cl_program program_mul_mv_f16_f32;
367
369
  cl_program program_mul_mv_f32_f32;
368
370
  cl_program program_mul;
371
+ cl_program program_mul_mat_f16_f32_tiled;
369
372
  cl_program program_div;
370
373
  cl_program program_sub;
371
374
  cl_program program_norm;
@@ -397,10 +400,13 @@ struct ggml_backend_opencl_context {
397
400
  cl_kernel kernel_scale;
398
401
  cl_kernel kernel_silu, kernel_silu_4;
399
402
  cl_kernel kernel_gelu, kernel_gelu_4;
403
+ cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
400
404
  cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
401
405
  cl_kernel kernel_relu;
402
406
  cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
403
407
  cl_kernel kernel_clamp;
408
+ cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
409
+ kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
404
410
  cl_kernel kernel_norm;
405
411
  cl_kernel kernel_rms_norm;
406
412
  cl_kernel kernel_group_norm;
@@ -408,6 +414,7 @@ struct ggml_backend_opencl_context {
408
414
  cl_kernel kernel_soft_max, kernel_soft_max_4;
409
415
  cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
410
416
  cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
417
+ cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
411
418
  cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
412
419
  cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
413
420
  cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -416,6 +423,7 @@ struct ggml_backend_opencl_context {
416
423
  cl_kernel kernel_mul_mat_f16_f32_1row;
417
424
  cl_kernel kernel_mul_mat_f16_f32;
418
425
  cl_kernel kernel_mul_mat_f16_f32_l4;
426
+ cl_kernel kernel_mul_mat_f16_f32_tiled;
419
427
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
420
428
  cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
421
429
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -525,6 +533,16 @@ struct ggml_backend_opencl_context {
525
533
  fclose(ftrace);
526
534
  }
527
535
 
536
+ size_t get_kernel_workgroup_size(cl_kernel kernel) const {
537
+ size_t workgroup_size = 0;
538
+ size_t ret_size = 0;
539
+ CL_CHECK(
540
+ clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
541
+ sizeof(size_t), &workgroup_size, &ret_size));
542
+ GGML_ASSERT(sizeof(size_t) == ret_size);
543
+ return workgroup_size;
544
+ }
545
+
528
546
  void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
529
547
  #ifdef GGML_OPENCL_PROFILING
530
548
  cl_event evt;
@@ -733,11 +751,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
733
751
 
734
752
  CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
735
753
  CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
754
+ CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
755
+ CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
736
756
  CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
737
757
  CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
738
758
  GGML_LOG_CONT(".");
739
759
  }
740
760
 
761
+ // glu
762
+ {
763
+ #ifdef GGML_OPENCL_EMBED_KERNELS
764
+ const std::string kernel_src {
765
+ #include "glu.cl.h"
766
+ };
767
+ #else
768
+ const std::string kernel_src = read_file("glu.cl");
769
+ #endif
770
+ backend_ctx->program_glu =
771
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
772
+
773
+ CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
774
+ CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
775
+ CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
776
+ CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
777
+ CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
778
+ CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
779
+ CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
780
+ CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
781
+ CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
782
+ CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
783
+ GGML_LOG_CONT(".");
784
+ }
785
+
741
786
  // get_rows
742
787
  {
743
788
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -972,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
972
1017
  GGML_LOG_CONT(".");
973
1018
  }
974
1019
 
1020
+ // mul_mat_f16_f32_tiled
1021
+ {
1022
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1023
+ const std::string kernel_src {
1024
+ #include "mul_mat_f16_f32.cl.h"
1025
+ };
1026
+ #else
1027
+ const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
1028
+ #endif
1029
+ backend_ctx->program_mul_mat_f16_f32_tiled =
1030
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1031
+
1032
+ CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
1033
+ GGML_LOG_CONT(".");
1034
+ }
1035
+
975
1036
  // mul
976
1037
  {
977
1038
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1400,6 +1461,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1400
1461
  }
1401
1462
  }
1402
1463
 
1464
+ // set_rows
1465
+ {
1466
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1467
+ const std::string kernel_src {
1468
+ #include "set_rows.cl.h"
1469
+ };
1470
+ #else
1471
+ const std::string kernel_src = read_file("set_rows.cl");
1472
+ #endif
1473
+ backend_ctx->program_set_rows =
1474
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1475
+
1476
+ CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
1477
+ CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
1478
+ GGML_LOG_CONT(".");
1479
+ }
1480
+
1403
1481
  // mul_mv_id_q4_0_f32_8x_flat
1404
1482
  {
1405
1483
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2163,7 +2241,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
2163
2241
  // dependencies.
2164
2242
  sync_with_other_backends(backend);
2165
2243
 
2166
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2244
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2167
2245
  continue;
2168
2246
  }
2169
2247
 
@@ -2198,6 +2276,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2198
2276
  default:
2199
2277
  return false;
2200
2278
  }
2279
+ case GGML_OP_SET_ROWS:
2280
+ {
2281
+ // TODO: add support
2282
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14274
2283
+ if (op->src[0]->type != GGML_TYPE_F32) {
2284
+ return false;
2285
+ }
2286
+ switch (op->type) {
2287
+ case GGML_TYPE_F16:
2288
+ case GGML_TYPE_F32:
2289
+ return true;
2290
+ default:
2291
+ return false;
2292
+ }
2293
+ }
2201
2294
  case GGML_OP_CPY:
2202
2295
  case GGML_OP_DUP:
2203
2296
  case GGML_OP_CONT:
@@ -2232,6 +2325,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2232
2325
  case GGML_UNARY_OP_GELU:
2233
2326
  case GGML_UNARY_OP_SILU:
2234
2327
  case GGML_UNARY_OP_RELU:
2328
+ case GGML_UNARY_OP_GELU_ERF:
2235
2329
  case GGML_UNARY_OP_GELU_QUICK:
2236
2330
  return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
2237
2331
  case GGML_UNARY_OP_SIGMOID:
@@ -2242,6 +2336,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2242
2336
  default:
2243
2337
  return false;
2244
2338
  }
2339
+ case GGML_OP_GLU:
2340
+ switch (ggml_get_glu_op(op)) {
2341
+ case GGML_GLU_OP_GEGLU:
2342
+ case GGML_GLU_OP_REGLU:
2343
+ case GGML_GLU_OP_SWIGLU:
2344
+ case GGML_GLU_OP_GEGLU_ERF:
2345
+ case GGML_GLU_OP_GEGLU_QUICK:
2346
+ return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
2347
+ default:
2348
+ return false;
2349
+ }
2245
2350
  case GGML_OP_CLAMP:
2246
2351
  return op->src[0]->type == GGML_TYPE_F32;
2247
2352
  case GGML_OP_SOFT_MAX:
@@ -3166,7 +3271,7 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3166
3271
 
3167
3272
  // Open file and dump.
3168
3273
  char fname[512];
3169
- sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
3274
+ snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
3170
3275
  FILE * f = fopen(fname, "w");
3171
3276
  if (!f) {
3172
3277
  printf("Failed to open %s\n", fname);
@@ -3325,6 +3430,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3325
3430
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3326
3431
  }
3327
3432
 
3433
+ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3434
+ GGML_ASSERT(src0);
3435
+ GGML_ASSERT(src0->extra);
3436
+ GGML_ASSERT(src1);
3437
+ GGML_ASSERT(src1->extra);
3438
+ GGML_ASSERT(dst);
3439
+ GGML_ASSERT(dst->extra);
3440
+
3441
+ // ne0 = ne00
3442
+ // ne2 = ne02
3443
+ // ne3 = ne03
3444
+
3445
+ const int ne01 = src0->ne[1];
3446
+ const int ne02 = src0->ne[2];
3447
+ const int ne03 = src0->ne[3];
3448
+
3449
+ const cl_ulong nb01 = src0->nb[1];
3450
+ const cl_ulong nb02 = src0->nb[2];
3451
+ const cl_ulong nb03 = src0->nb[3];
3452
+
3453
+ const int ne11 = src1->ne[1];
3454
+ const int ne12 = src1->ne[2];
3455
+
3456
+ const cl_ulong nb10 = src1->nb[0];
3457
+ const cl_ulong nb11 = src1->nb[1];
3458
+ const cl_ulong nb12 = src1->nb[2];
3459
+
3460
+ const int ne0 = dst->ne[0];
3461
+
3462
+ const cl_ulong nb1 = dst->nb[1];
3463
+ const cl_ulong nb2 = dst->nb[2];
3464
+ const cl_ulong nb3 = dst->nb[3];
3465
+
3466
+ const int nblk0 = ne0/ggml_blck_size(dst->type);
3467
+
3468
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3469
+
3470
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3471
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
3472
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3473
+
3474
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
3475
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
3476
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
3477
+
3478
+ cl_kernel kernel;
3479
+
3480
+ switch (dst->type) {
3481
+ case GGML_TYPE_F32:
3482
+ kernel = backend_ctx->kernel_set_rows_f32;
3483
+ break;
3484
+ case GGML_TYPE_F16:
3485
+ kernel = backend_ctx->kernel_set_rows_f16;
3486
+ break;
3487
+ default:
3488
+ GGML_ABORT("not implemented");
3489
+ }
3490
+
3491
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3492
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3493
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
3494
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
3495
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
3496
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
3497
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
3498
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3499
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3500
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3501
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
3502
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
3503
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
3504
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
3505
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
3506
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
3507
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
3508
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
3509
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
3510
+
3511
+ int nth0 = 64;
3512
+ if (backend_ctx->gpu_family == INTEL) {
3513
+ nth0 = 32;
3514
+ } else if (backend_ctx->gpu_family == ADRENO) {
3515
+ nth0 = 64;
3516
+ }
3517
+
3518
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
3519
+ while (nth0 < nblk0 && nth0 < max_workgroup_size) {
3520
+ nth0 *= 2;
3521
+ }
3522
+
3523
+ int rows_per_workgroup = 1;
3524
+ if (nth0 > nblk0) {
3525
+ rows_per_workgroup = nth0 / nblk0;
3526
+ nth0 = nblk0;
3527
+ }
3528
+
3529
+ size_t global_work_size[] = {
3530
+ (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
3531
+ (size_t)ne02*rows_per_workgroup,
3532
+ (size_t)ne03};
3533
+ size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
3534
+
3535
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3536
+ }
3537
+
3328
3538
  static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3329
3539
  GGML_ASSERT(src0);
3330
3540
  GGML_ASSERT(src0->extra);
@@ -3825,6 +4035,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
3825
4035
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3826
4036
  }
3827
4037
 
4038
+ static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4039
+ GGML_ASSERT(src0);
4040
+ GGML_ASSERT(src0->extra);
4041
+ GGML_ASSERT(dst);
4042
+ GGML_ASSERT(dst->extra);
4043
+
4044
+ UNUSED(src1);
4045
+
4046
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4047
+
4048
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4049
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4050
+
4051
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4052
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4053
+
4054
+ cl_kernel kernel;
4055
+
4056
+ int n = ggml_nelements(dst);
4057
+
4058
+ if (n % 4 == 0) {
4059
+ kernel = backend_ctx->kernel_gelu_erf_4;
4060
+ n /= 4;
4061
+ } else {
4062
+ kernel = backend_ctx->kernel_gelu_erf;
4063
+ }
4064
+
4065
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4066
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
4067
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4068
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4069
+
4070
+ size_t global_work_size[] = {(size_t)n, 1, 1};
4071
+ size_t local_work_size[] = {64, 1, 1};
4072
+
4073
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
4074
+ }
4075
+
3828
4076
  static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3829
4077
  GGML_ASSERT(src0);
3830
4078
  GGML_ASSERT(src0->extra);
@@ -4420,7 +4668,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4420
4668
 
4421
4669
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
4670
 
4423
- const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4671
+ const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4672
+ const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
4424
4673
  cl_kernel kernel = nullptr;
4425
4674
 
4426
4675
  if (mode == GGML_SCALE_MODE_NEAREST) {
@@ -4451,18 +4700,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4451
4700
  const cl_ulong nb02 = src0->nb[2];
4452
4701
  const cl_ulong nb03 = src0->nb[3];
4453
4702
 
4454
- const int ne00_src = src0->ne[0];
4455
- const int ne01_src = src0->ne[1];
4703
+ const int ne00 = src0->ne[0];
4704
+ const int ne01 = src0->ne[1];
4705
+ const int ne02 = src0->ne[2];
4706
+ const int ne03 = src0->ne[3];
4707
+
4708
+ const int ne0 = dst->ne[0];
4709
+ const int ne1 = dst->ne[1];
4710
+ const int ne2 = dst->ne[2];
4711
+ const int ne3 = dst->ne[3];
4456
4712
 
4457
- const int ne10_dst = dst->ne[0];
4458
- const int ne11_dst = dst->ne[1];
4459
- const int ne12_dst = dst->ne[2];
4460
- const int ne13_dst = dst->ne[3];
4713
+ float sf0 = (float)ne0 / ne00;
4714
+ float sf1 = (float)ne1 / ne01;
4715
+ float sf2 = (float)ne2 / ne02;
4716
+ float sf3 = (float)ne3 / ne03;
4461
4717
 
4462
- const float sf0 = (float)dst->ne[0] / src0->ne[0];
4463
- const float sf1 = (float)dst->ne[1] / src0->ne[1];
4464
- const float sf2 = (float)dst->ne[2] / src0->ne[2];
4465
- const float sf3 = (float)dst->ne[3] / src0->ne[3];
4718
+ float pixel_offset = 0.5f;
4466
4719
 
4467
4720
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4468
4721
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
@@ -4474,29 +4727,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
4474
4727
  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
4475
4728
 
4476
4729
  if (mode == GGML_SCALE_MODE_NEAREST) {
4477
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
4478
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
4479
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
4480
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
4730
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
4731
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
4732
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
4733
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
4481
4734
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
4482
4735
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
4483
4736
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
4484
4737
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
4485
4738
  } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4486
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
4487
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
4488
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
4489
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
4490
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
4491
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
4739
+ if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
4740
+ sf0 = (float)(ne0 - 1) / (ne00 - 1);
4741
+ sf1 = (float)(ne1 - 1) / (ne01 - 1);
4742
+ pixel_offset = 0.0f;
4743
+ }
4744
+
4745
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
4746
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
4747
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
4748
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
4749
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
4750
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
4492
4751
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
4493
4752
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
4494
4753
  CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
4495
4754
  CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
4755
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
4496
4756
  }
4497
4757
 
4498
4758
 
4499
- size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
4759
+ size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
4500
4760
  if (dst_total_elements == 0) {
4501
4761
  return;
4502
4762
  }
@@ -4685,6 +4945,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
4685
4945
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
4686
4946
  }
4687
4947
 
4948
+ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4949
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4950
+
4951
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4952
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4953
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4954
+
4955
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
4956
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
4957
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
4958
+
4959
+ const int M = src0->ne[1];
4960
+ const int N = src1->ne[1];
4961
+ const int K = src0->ne[0];
4962
+
4963
+ cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
4964
+
4965
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
4966
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
4967
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
4968
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device));
4969
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
4970
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
4971
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
4972
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
4973
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
4974
+
4975
+ // Tiling parameters. These need to be tuned for optimal performance.
4976
+ // They must match the #defines in the kernel mul_mat_f16_f32.cl.
4977
+ //
4978
+ // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
4979
+ // TPWM / TPWN: Threads per Work-group. This is the work-group size.
4980
+ // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
4981
+ //
4982
+ // The following relationships must hold:
4983
+ // OPWM = TPWM * OPTM
4984
+ // OPWN = TPWN * OPTN
4985
+ //
4986
+ const int OPWM = 64;
4987
+ const int OPWN = 64;
4988
+ const int TPWM = 16;
4989
+ const int TPWN = 8;
4990
+
4991
+ size_t local_work_size[2] = { TPWM, TPWN };
4992
+ size_t global_work_size[2] = {
4993
+ (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
4994
+ (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
4995
+ };
4996
+
4997
+ backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
4998
+ }
4999
+
4688
5000
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4689
5001
  GGML_ASSERT(src0);
4690
5002
  GGML_ASSERT(src0->extra);
@@ -4698,6 +5010,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4698
5010
 
4699
5011
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4700
5012
 
5013
+ if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
5014
+ src0->ne[1] > 32 && // M > 32
5015
+ src1->ne[1] > 32 && // N > 32
5016
+ src0->ne[0] > 32 && // K > 32
5017
+ src0->ne[2] == 1 && src0->ne[3] == 1 &&
5018
+ src1->ne[2] == 1 && src1->ne[3] == 1 &&
5019
+ ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
5020
+ backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
5021
+ ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
5022
+ return;
5023
+ }
5024
+
4701
5025
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4702
5026
  ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4703
5027
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5488,7 +5812,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5488
5812
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5489
5813
 
5490
5814
  float scale;
5491
- memcpy(&scale, dst->op_params, sizeof(scale));
5815
+ float bias;
5816
+ memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
5817
+ memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
5492
5818
 
5493
5819
  ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5494
5820
  ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5503,6 +5829,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
5503
5829
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
5504
5830
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
5505
5831
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
5832
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
5506
5833
 
5507
5834
  int n = ggml_nelements(dst)/4;
5508
5835
 
@@ -5712,19 +6039,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5712
6039
 
5713
6040
  cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
5714
6041
 
5715
- const int ne00 = src0 ? src0->ne[0] : 0;
5716
- const int ne01 = src0 ? src0->ne[1] : 0;
5717
- const int ne02 = src0 ? src0->ne[2] : 0;
5718
- const int ne03 = src0 ? src0->ne[3] : 0;
6042
+ const int ne00 = src0->ne[0];
6043
+ const int ne01 = src0->ne[1];
6044
+ const int ne02 = src0->ne[2];
6045
+ const int ne03 = src0->ne[3];
6046
+
6047
+ const cl_long nb01 = src0->nb[1];
6048
+ const cl_long nb02 = src0->nb[2];
6049
+ const cl_long nb03 = src0->nb[3];
6050
+
6051
+ const int ne12 = src1 ? src1->ne[2] : 0;
6052
+ const int ne13 = src1 ? src1->ne[3] : 0;
6053
+
6054
+ const cl_long nb11 = src1 ? src1->nb[1] : 0;
6055
+ const cl_long nb12 = src1 ? src1->nb[2] : 0;
6056
+ const cl_long nb13 = src1 ? src1->nb[3] : 0;
6057
+
6058
+ const cl_long nb1 = dst->nb[1];
6059
+ const cl_long nb2 = dst->nb[2];
6060
+ const cl_long nb3 = dst->nb[3];
5719
6061
 
5720
6062
  float scale, max_bias;
5721
6063
  memcpy(&scale, dst->op_params + 0, sizeof(float));
5722
6064
  memcpy(&max_bias, dst->op_params + 1, sizeof(float));
5723
6065
 
5724
- const int nrows_x = ggml_nrows(src0);
5725
- const int nrows_y = src0->ne[1];
5726
-
5727
- const int n_head = nrows_x/nrows_y;
6066
+ const int n_head = src0->ne[2];
5728
6067
  const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
5729
6068
 
5730
6069
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
@@ -5769,13 +6108,22 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
5769
6108
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5770
6109
  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5771
6110
  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
5772
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
5773
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
5774
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
5775
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
5776
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
5777
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
5778
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
6111
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
6112
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
6113
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
6114
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
6115
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
6116
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
6117
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
6118
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
6119
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
6120
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
6121
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
6122
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
6123
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
6124
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
6125
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
6126
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
5779
6127
 
5780
6128
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5781
6129
  size_t local_work_size[] = {(size_t)nth, 1, 1};
@@ -6143,6 +6491,105 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
6143
6491
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6144
6492
  }
6145
6493
 
6494
+ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6495
+ GGML_ASSERT(src0);
6496
+ GGML_ASSERT(src0->extra);
6497
+ GGML_ASSERT(dst);
6498
+ GGML_ASSERT(dst->extra);
6499
+
6500
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
6501
+
6502
+ if (src1) {
6503
+ GGML_ASSERT(src1);
6504
+ GGML_ASSERT(src1->extra);
6505
+ GGML_ASSERT(ggml_are_same_shape(src0, src1));
6506
+ }
6507
+
6508
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6509
+
6510
+ cl_kernel kernel;
6511
+ switch (ggml_get_glu_op(dst)) {
6512
+ case GGML_GLU_OP_GEGLU:
6513
+ if (dst->type == GGML_TYPE_F32) {
6514
+ kernel = backend_ctx->kernel_geglu;
6515
+ } else {
6516
+ kernel = backend_ctx->kernel_geglu_f16;
6517
+ }
6518
+ break;
6519
+ case GGML_GLU_OP_REGLU:
6520
+ if (dst->type == GGML_TYPE_F32) {
6521
+ kernel = backend_ctx->kernel_reglu;
6522
+ } else {
6523
+ kernel = backend_ctx->kernel_reglu_f16;
6524
+ }
6525
+ break;
6526
+ case GGML_GLU_OP_SWIGLU:
6527
+ if (dst->type == GGML_TYPE_F32) {
6528
+ kernel = backend_ctx->kernel_swiglu;
6529
+ } else {
6530
+ kernel = backend_ctx->kernel_swiglu_f16;
6531
+ }
6532
+ break;
6533
+ case GGML_GLU_OP_GEGLU_ERF:
6534
+ if (dst->type == GGML_TYPE_F32) {
6535
+ kernel = backend_ctx->kernel_geglu_erf;
6536
+ } else {
6537
+ kernel = backend_ctx->kernel_geglu_erf_f16;
6538
+ }
6539
+ break;
6540
+ case GGML_GLU_OP_GEGLU_QUICK:
6541
+ if (dst->type == GGML_TYPE_F32) {
6542
+ kernel = backend_ctx->kernel_geglu_quick;
6543
+ } else {
6544
+ kernel = backend_ctx->kernel_geglu_quick_f16;
6545
+ }
6546
+ break;
6547
+ default:
6548
+ GGML_ABORT("Unsupported glu op");
6549
+ }
6550
+
6551
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6552
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6553
+
6554
+ ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
6555
+
6556
+ cl_ulong offset0 = extra0->offset + src0->view_offs;
6557
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
6558
+
6559
+ cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
6560
+
6561
+ const int ne0 = dst->ne[0];
6562
+
6563
+ const cl_ulong nb01 = src0->nb[1];
6564
+ const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
6565
+
6566
+ const cl_ulong nb1 = dst->nb[1];
6567
+
6568
+ const int swp = ((const int32_t *) dst->op_params)[1];
6569
+ const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
6570
+ const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
6571
+
6572
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6573
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6574
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
6575
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6576
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6577
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6578
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
6579
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
6580
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
6581
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
6582
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
6583
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
6584
+
6585
+ const size_t nrows = ggml_nrows(src0);
6586
+ size_t nth = 512;
6587
+ size_t global_work_size[] = {nrows*nth, 1, 1};
6588
+ size_t local_work_size[] = {nth, 1, 1};
6589
+
6590
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6591
+ }
6592
+
6146
6593
  //------------------------------------------------------------------------------
6147
6594
  // Op offloading
6148
6595
  //------------------------------------------------------------------------------
@@ -6166,6 +6613,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6166
6613
  }
6167
6614
  func = ggml_cl_get_rows;
6168
6615
  break;
6616
+ case GGML_OP_SET_ROWS:
6617
+ if (!any_on_device) {
6618
+ return false;
6619
+ }
6620
+ func = ggml_cl_set_rows;
6621
+ break;
6169
6622
  case GGML_OP_CPY:
6170
6623
  if (!any_on_device) {
6171
6624
  return false;
@@ -6211,6 +6664,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6211
6664
  }
6212
6665
  func = ggml_cl_gelu;
6213
6666
  break;
6667
+ case GGML_UNARY_OP_GELU_ERF:
6668
+ if (!any_on_device) {
6669
+ return false;
6670
+ }
6671
+ func = ggml_cl_gelu_erf;
6672
+ break;
6214
6673
  case GGML_UNARY_OP_GELU_QUICK:
6215
6674
  if (!any_on_device) {
6216
6675
  return false;
@@ -6244,6 +6703,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
6244
6703
  default:
6245
6704
  return false;
6246
6705
  } break;
6706
+ case GGML_OP_GLU:
6707
+ if (!any_on_device) {
6708
+ return false;
6709
+ }
6710
+ func = ggml_cl_glu;
6711
+ break;
6247
6712
  case GGML_OP_CLAMP:
6248
6713
  if (!any_on_device) {
6249
6714
  return false;