@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -15,13 +15,12 @@
15
15
 
16
16
  #include <CL/cl.h>
17
17
 
18
+ #include <inttypes.h>
18
19
  #include <string.h>
19
20
 
20
21
  #include <cstddef>
21
22
  #include <cstdint>
22
- #include <atomic>
23
23
  #include <fstream>
24
- #include <limits>
25
24
  #include <vector>
26
25
  #include <string>
27
26
  #include <cmath>
@@ -54,6 +53,37 @@
54
53
 
55
54
  bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
56
55
 
56
+ // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
57
+ // Precompute mp (m' in the paper) and L such that division
58
+ // can be computed using a multiply (high 32b of 64b result)
59
+ // and a shift:
60
+ //
61
+ // n/d = (mulhi(n, mp) + n) >> L;
62
+ struct fastdiv_vals {
63
+ uint32_t mp;
64
+ uint32_t L;
65
+ uint32_t d;
66
+ uint32_t pad;
67
+ };
68
+ static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
69
+
70
+ static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
71
+ GGML_ASSERT(d_64 != 0);
72
+ GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
73
+
74
+ uint32_t d = (uint32_t)d_64;
75
+
76
+ // compute L = ceil(log2(d));
77
+ uint32_t L = 0;
78
+ while (L < 32 && (uint32_t{ 1 } << L) < d) {
79
+ L++;
80
+ }
81
+
82
+ uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
83
+ // pack divisor as well to reduce error surface
84
+ return { mp, L, d, 0 };
85
+ }
86
+
57
87
  enum GPU_FAMILY {
58
88
  ADRENO,
59
89
  INTEL,
@@ -367,7 +397,9 @@ struct ggml_backend_opencl_context {
367
397
  cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
368
398
  cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
369
399
  cl_program program_mul_mv_q6_K;
400
+ cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
370
401
  cl_program program_mul_mv_mxfp4_f32;
402
+ cl_program program_mul_mv_mxfp4_f32_flat;
371
403
  cl_program program_mul_mv_f16_f16;
372
404
  cl_program program_mul_mv_f16_f32_1row;
373
405
  cl_program program_mul_mv_f16_f32_l4;
@@ -375,6 +407,8 @@ struct ggml_backend_opencl_context {
375
407
  cl_program program_mul_mv_f32_f32;
376
408
  cl_program program_mul;
377
409
  cl_program program_mul_mat_f16_f32_tiled;
410
+ cl_program program_mul_mm_f16_f32_kqv;
411
+ cl_program program_mul_mm_f16_f32_kq;
378
412
  cl_program program_div;
379
413
  cl_program program_sub;
380
414
  cl_program program_norm;
@@ -400,10 +434,14 @@ struct ggml_backend_opencl_context {
400
434
  cl_program program_conv_2d_f32;
401
435
  cl_program program_conv_2d_f16_f32;
402
436
  cl_program program_tsembd;
437
+ cl_program program_gemv_moe_mxfp4_f32, program_gemm_moe_mxfp4_f32;
403
438
  cl_program program_mul_mv_id_q4_0_f32_8x_flat;
439
+ cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
404
440
  cl_program program_mul_mv_id_mxfp4_f32;
441
+ cl_program program_mul_mv_id_mxfp4_f32_flat;
405
442
  cl_program program_mul_mm_f32_f32_l4_lm;
406
443
  cl_program program_mul_mm_f16_f32_l4_lm;
444
+ cl_program program_mul_mm_q8_0_f32_l4_lm;
407
445
 
408
446
  cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
409
447
  cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
@@ -435,7 +473,7 @@ struct ggml_backend_opencl_context {
435
473
  std::map<std::pair<int, int>, int> kernels_flash_attn_bm;
436
474
  std::map<std::pair<int, int>, int> kernels_flash_attn_bn;
437
475
  cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
438
- cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
476
+ cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
439
477
  cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
440
478
  cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
441
479
  cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -445,13 +483,18 @@ struct ggml_backend_opencl_context {
445
483
  cl_kernel kernel_mul_mat_f16_f32;
446
484
  cl_kernel kernel_mul_mat_f16_f32_l4;
447
485
  cl_kernel kernel_mul_mat_f16_f32_tiled;
486
+ cl_kernel kernel_mul_mm_f16_f32_kqv;
487
+ cl_kernel kernel_mul_mm_f16_f32_kq;
448
488
  cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
449
489
  cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
490
+ cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
491
+ cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
450
492
  cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
451
493
  cl_kernel kernel_convert_block_q4_0_noshuffle;
452
494
  cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
453
495
  cl_kernel kernel_mul_mv_q6_K_f32;
454
- cl_kernel kernel_mul_mv_mxfp4_f32;
496
+ cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
497
+ cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
455
498
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
456
499
  cl_kernel kernel_argsort_f32_i32;
457
500
  cl_kernel kernel_sum_rows_f32;
@@ -467,10 +510,14 @@ struct ggml_backend_opencl_context {
467
510
  cl_kernel kernel_conv_2d_f32;
468
511
  cl_kernel kernel_conv_2d_f16_f32;
469
512
  cl_kernel kernel_timestep_embedding;
513
+ cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
470
514
  cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
515
+ cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
471
516
  cl_kernel kernel_mul_mv_id_mxfp4_f32;
517
+ cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
472
518
  cl_kernel kernel_mul_mm_f32_f32_l4_lm;
473
519
  cl_kernel kernel_mul_mm_f16_f32_l4_lm;
520
+ cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
474
521
 
475
522
  std::vector<ProfilingInfo> profiling_info;
476
523
 
@@ -520,25 +567,17 @@ struct ggml_backend_opencl_context {
520
567
  }
521
568
 
522
569
  // Dump a csv
523
- float total_kernel_time = 0;
524
- fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
570
+ fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
525
571
  for (const ProfilingInfo & info : profiling_info) {
526
- total_kernel_time += info.cmd_duration_ns/1.e6f;
527
- fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
572
+ fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
528
573
  info.op_name.c_str(), info.kernel_name.c_str(),
529
- info.cmd_queued_duration_ns/1.e6f,
530
- info.cmd_submit_duration_ns/1.e6f,
531
574
  info.cmd_duration_ns/1.e6f,
532
- info.cmd_complete_duration_ns/1.e6f,
533
- info.cmd_total_duration_ns/1.e6f,
534
575
  info.global_size[0], info.global_size[1], info.global_size[2],
535
576
  info.local_size[0], info.local_size[1], info.local_size[2],
536
577
  info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
537
578
  }
538
579
  fclose(fperf);
539
580
 
540
- GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
541
-
542
581
  // Dump a simple chrome trace
543
582
  FILE* ftrace = fopen("cl_trace.json", "w");
544
583
  if (!ftrace) {
@@ -548,14 +587,14 @@ struct ggml_backend_opencl_context {
548
587
 
549
588
  fprintf(ftrace, "[\n");
550
589
  for (const ProfilingInfo & info : profiling_info) {
551
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
590
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
552
591
  info.kernel_name.c_str(), info.cmd_queued/1000);
553
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
592
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
554
593
  info.kernel_name.c_str(), info.cmd_submit/1000);
555
594
 
556
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
595
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
557
596
  info.kernel_name.c_str(), info.cmd_start/1000);
558
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
597
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
559
598
  info.kernel_name.c_str(), info.cmd_end/1000);
560
599
  }
561
600
  fclose(ftrace);
@@ -765,6 +804,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
765
804
  CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
766
805
  CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
767
806
  CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
807
+ CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
808
+ CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
809
+ CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
810
+ CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
811
+ CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
812
+ CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
768
813
  GGML_LOG_CONT(".");
769
814
  }
770
815
 
@@ -986,6 +1031,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
986
1031
  GGML_LOG_CONT(".");
987
1032
  }
988
1033
 
1034
+ // mul_mv_q8_0_f32
1035
+ {
1036
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1037
+ const std::string kernel_src {
1038
+ #include "mul_mv_q8_0_f32.cl.h"
1039
+ };
1040
+ #else
1041
+ const std::string kernel_src = read_file("mul_mv_q8_0_f32.cl");
1042
+ #endif
1043
+ backend_ctx->program_mul_mv_q8_0_f32 =
1044
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1045
+
1046
+ CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32, "kernel_mul_mv_q8_0_f32", &err), err));
1047
+ GGML_LOG_CONT(".");
1048
+ }
1049
+
1050
+ // mul_mv_q8_0_f32_flat
1051
+ {
1052
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1053
+ const std::string kernel_src {
1054
+ #include "mul_mv_q8_0_f32_flat.cl.h"
1055
+ };
1056
+ #else
1057
+ const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
1058
+ #endif
1059
+ backend_ctx->program_mul_mv_q8_0_f32_flat =
1060
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1061
+
1062
+ CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
1063
+ GGML_LOG_CONT(".");
1064
+ }
1065
+
989
1066
  // mul_mv_mxfp4_f32
990
1067
  {
991
1068
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1002,6 +1079,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1002
1079
  GGML_LOG_CONT(".");
1003
1080
  }
1004
1081
 
1082
+ // mul_mv_mxfp4_f32_flat
1083
+ {
1084
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1085
+ const std::string kernel_src {
1086
+ #include "mul_mv_mxfp4_f32_flat.cl.h"
1087
+ };
1088
+ #else
1089
+ const std::string kernel_src = read_file("mul_mv_mxfp4_f32_flat.cl");
1090
+ #endif
1091
+ backend_ctx->program_mul_mv_mxfp4_f32_flat =
1092
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1093
+
1094
+ CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32_flat, "kernel_mul_mv_mxfp4_f32_flat", &err), err));
1095
+ GGML_LOG_CONT(".");
1096
+ }
1097
+
1005
1098
  // mul_mv_f16_f16
1006
1099
  {
1007
1100
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1130,6 +1223,41 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1130
1223
  GGML_LOG_CONT(".");
1131
1224
  }
1132
1225
 
1226
+ // mul_mm_q8_0_f32_l4_lm
1227
+ {
1228
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1229
+ const std::string kernel_src {
1230
+ #include "mul_mm_q8_0_f32_l4_lm.cl.h"
1231
+ };
1232
+ #else
1233
+ const std::string kernel_src = read_file("mul_mm_q8_0_f32_l4_lm.cl");
1234
+ #endif
1235
+ backend_ctx->program_mul_mm_q8_0_f32_l4_lm =
1236
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1237
+
1238
+ CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_q8_0_f32_l4_lm, "kernel_mul_mm_q8_0_f32_l4_lm", &err), err));
1239
+ GGML_LOG_CONT(".");
1240
+ }
1241
+
1242
+ // mul_mm_f16_f32_kq_kqv
1243
+ {
1244
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1245
+ const std::string kernel_src {
1246
+ #include "mul_mm_f16_f32_kq_kqv.cl.h"
1247
+ };
1248
+ #else
1249
+ const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
1250
+ #endif
1251
+ backend_ctx->program_mul_mm_f16_f32_kqv =
1252
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts+" -DKQV ");
1253
+ backend_ctx->program_mul_mm_f16_f32_kq =
1254
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1255
+
1256
+ CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
1257
+ CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq, "mul_mm_f16_f32_kq", &err), err));
1258
+ GGML_LOG_CONT(".");
1259
+ }
1260
+
1133
1261
  // mul
1134
1262
  {
1135
1263
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1339,7 +1467,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1339
1467
 
1340
1468
  if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
1341
1469
  const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
1342
- { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
1470
+ { 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
1343
1471
  {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
1344
1472
  {192, 192, 16, 16}, {256, 256, 16, 16},
1345
1473
  };
@@ -1649,8 +1777,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1649
1777
  backend_ctx->program_set_rows =
1650
1778
  build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1651
1779
 
1652
- CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
1653
- CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
1780
+ CL_CHECK((backend_ctx->kernel_set_rows_f32_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i64", &err), err));
1781
+ CL_CHECK((backend_ctx->kernel_set_rows_f32_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i32", &err), err));
1782
+ CL_CHECK((backend_ctx->kernel_set_rows_f16_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i64", &err), err));
1783
+ CL_CHECK((backend_ctx->kernel_set_rows_f16_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i32", &err), err));
1654
1784
  GGML_LOG_CONT(".");
1655
1785
  }
1656
1786
 
@@ -1711,6 +1841,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1711
1841
  GGML_LOG_CONT(".");
1712
1842
  }
1713
1843
 
1844
+ // mul_mv_id_q8_0_f32
1845
+ {
1846
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1847
+ const std::string kernel_src {
1848
+ #include "mul_mv_id_q8_0_f32.cl.h"
1849
+ };
1850
+ #else
1851
+ const std::string kernel_src = read_file("mul_mv_id_q8_0_f32.cl");
1852
+ #endif
1853
+ backend_ctx->program_mul_mv_id_q8_0_f32 =
1854
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1855
+
1856
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32, "kernel_mul_mv_id_q8_0_f32", &err), err));
1857
+ GGML_LOG_CONT(".");
1858
+ }
1859
+
1860
+ // mul_mv_id_q8_0_f32_flat
1861
+ {
1862
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1863
+ const std::string kernel_src {
1864
+ #include "mul_mv_id_q8_0_f32_flat.cl.h"
1865
+ };
1866
+ #else
1867
+ const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
1868
+ #endif
1869
+ backend_ctx->program_mul_mv_id_q8_0_f32_flat =
1870
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1871
+
1872
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
1873
+ GGML_LOG_CONT(".");
1874
+ }
1875
+
1714
1876
  // mul_mv_id_mxfp4_f32
1715
1877
  {
1716
1878
  #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1727,6 +1889,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1727
1889
  GGML_LOG_CONT(".");
1728
1890
  }
1729
1891
 
1892
+ // mul_mv_id_mxfp4_f32_flat
1893
+ {
1894
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1895
+ const std::string kernel_src {
1896
+ #include "mul_mv_id_mxfp4_f32_flat.cl.h"
1897
+ };
1898
+ #else
1899
+ const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32_flat.cl");
1900
+ #endif
1901
+ backend_ctx->program_mul_mv_id_mxfp4_f32_flat =
1902
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1903
+
1904
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32_flat, "kernel_mul_mv_id_mxfp4_f32_flat", &err), err));
1905
+ GGML_LOG_CONT(".");
1906
+ }
1907
+
1730
1908
  // Adreno kernels
1731
1909
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1732
1910
  // transpose
@@ -1862,6 +2040,42 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1862
2040
  CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
1863
2041
  GGML_LOG_CONT(".");
1864
2042
  }
2043
+
2044
+ std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
2045
+ " -cl-mad-enable "
2046
+ " -cl-fast-relaxed-math";
2047
+
2048
+ // gemv_moe_mxfp4_f32
2049
+ {
2050
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2051
+ const std::string kernel_src {
2052
+ #include "gemv_moe_mxfp4_f32.cl.h"
2053
+ };
2054
+ #else
2055
+ const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
2056
+ #endif
2057
+ backend_ctx->program_gemv_moe_mxfp4_f32 =
2058
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2059
+
2060
+ CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
2061
+ GGML_LOG_CONT(".");
2062
+ }
2063
+
2064
+ // gemm_moe_mxfp4_f32
2065
+ {
2066
+ #ifdef GGML_OPENCL_EMBED_KERNELS
2067
+ const std::string kernel_src {
2068
+ #include "gemm_moe_mxfp4_f32.cl.h"
2069
+ };
2070
+ #else
2071
+ const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
2072
+ #endif
2073
+ backend_ctx->program_gemm_moe_mxfp4_f32 =
2074
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2075
+
2076
+ CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
2077
+ GGML_LOG_CONT(".");
2078
+ }
1865
2079
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1866
2080
  GGML_LOG_CONT("\n");
1867
2081
  }
@@ -2237,8 +2451,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
2237
2451
  svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
2238
2452
 
2239
2453
  if (opencl_c_version.major >= 3) {
2454
+ // Assume it is not available for 3.0, since it is optional in 3.0.
2455
+ // If compiling against 3.0, then we can query.
2456
+ backend_ctx->non_uniform_workgroups = false;
2457
+ #if CL_TARGET_OPENCL_VERSION >= 300
2240
2458
  CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
2241
2459
  &backend_ctx->non_uniform_workgroups, 0));
2460
+ #endif
2242
2461
  } else {
2243
2462
  GGML_ASSERT(opencl_c_version.major == 2);
2244
2463
  // Non-uniform workgroup sizes is mandatory feature in v2.x.
@@ -2391,6 +2610,84 @@ struct ggml_tensor_extra_cl_q4_0 {
2391
2610
  }
2392
2611
  };
2393
2612
 
2613
+ struct ggml_tensor_extra_cl_mxfp4 { // Per-tensor OpenCL handles for MXFP4 data kept as separate quant (q) and scale (e) buffers.
2614
+ // Quantized values.
2615
+ cl_mem q = nullptr;
2616
+ // Quantized values in image1d_buffer_t.
2617
+ cl_mem q_img = nullptr;
2618
+ // Scales in E8M0.
2619
+ cl_mem e = nullptr;
2620
+ // Scales in image1d_buffer_t.
2621
+ cl_mem e_img = nullptr;
2622
+ // Size of quantized values.
2623
+ size_t size_q = 0;
2624
+ // Size of scales.
2625
+ size_t size_e = 0;
2626
+
2627
+ ~ggml_tensor_extra_cl_mxfp4() { // Releases any handles still held.
2628
+ reset();
2629
+ }
2630
+
2631
+ void reset() { // Releases q, e and q_img if set, then clears every field; safe to call repeatedly.
2632
+ // q and e are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
2633
+ // They must be properly released so that the original buffer can be
2634
+ // properly released to avoid memory leak.
2635
+ if (q != nullptr) {
2636
+ CL_CHECK(clReleaseMemObject(q));
2637
+ q = nullptr;
2638
+ }
2639
+ if (e != nullptr) {
2640
+ CL_CHECK(clReleaseMemObject(e));
2641
+ e = nullptr;
2642
+ }
2643
+ if (q_img != nullptr) {
2644
+ CL_CHECK(clReleaseMemObject(q_img));
2645
+ q_img = nullptr;
2646
+ }
2647
+ // q_img, when present, is an image1d_buffer_t wrapping q and is released above.
2648
+ // e_img is not created by the code visible here, so it is only cleared.
2649
+ q_img = nullptr;
2650
+ e_img = nullptr;
2651
+ size_q = 0;
2652
+ size_e = 0;
2653
+ }
2654
+ };
2655
+
2656
+ struct ggml_tensor_extra_cl_q8_0 { // Per-tensor OpenCL handles for Q8_0 data kept as separate quant (q) and scale (d) buffers.
2657
+ cl_mem q = nullptr; // Quantized values; released by reset().
2658
+ cl_mem q_img = nullptr; // Image handle for q; only cleared by reset(), never released here.
2659
+
2660
+ cl_mem d = nullptr; // Scales; released by reset().
2661
+ cl_mem d_img = nullptr; // Image handle for d; only cleared by reset(), never released here.
2662
+
2663
+ size_t size_q = 0; // Presumably the byte size of q - not assigned in the code visible here.
2664
+ size_t size_d = 0; // Presumably the byte size of d - not assigned in the code visible here.
2665
+
2666
+ ~ggml_tensor_extra_cl_q8_0() { // Releases any handles still held.
2667
+ reset();
2668
+ }
2669
+
2670
+ void reset() { // Releases q and d if set, then clears every field; safe to call repeatedly.
2671
+ // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
2672
+ // They must be properly released so that the original buffer can be
2673
+ // properly released to avoid memory leak.
2674
+ if (q != nullptr) {
2675
+ CL_CHECK(clReleaseMemObject(q));
2676
+ q = nullptr;
2677
+ }
2678
+ if (d != nullptr) {
2679
+ CL_CHECK(clReleaseMemObject(d));
2680
+ d = nullptr;
2681
+ }
2682
+ // Currently, q_img and d_img are not used. They can be image1d_buffer_t
2683
+ // that wraps around q and d to utilize image access path.
2684
+ q_img = nullptr;
2685
+ d_img = nullptr;
2686
+ size_q = 0;
2687
+ size_d = 0;
2688
+ }
2689
+ };
2690
+
2394
2691
  //------------------------------------------------------------------------------
2395
2692
  // Backend API
2396
2693
  //------------------------------------------------------------------------------
@@ -2492,7 +2789,7 @@ static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
2492
2789
 
2493
2790
  // if rms_norm is the B operand, then we don't handle broadcast
2494
2791
  if (rms_norm == mul->src[1] &&
2495
- !ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
2792
+ !ggml_are_same_shape(mul->src[0], rms_norm)) {
2496
2793
  return false;
2497
2794
  }
2498
2795
 
@@ -2616,7 +2913,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2616
2913
  switch (op->type) {
2617
2914
  case GGML_TYPE_F16:
2618
2915
  case GGML_TYPE_F32:
2619
- return true;
2916
+ return (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
2620
2917
  default:
2621
2918
  return false;
2622
2919
  }
@@ -2700,10 +2997,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2700
2997
  case GGML_OP_REPEAT:
2701
2998
  return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
2702
2999
  case GGML_OP_PAD:
2703
- return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
2704
- op->src[0]->ne[3] == 1 && op->ne[3] == 1;
2705
- case GGML_OP_UPSCALE:
2706
3000
  return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
3001
+ case GGML_OP_UPSCALE: {
3002
+ ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
3003
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
3004
+ (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR);
3005
+ }
2707
3006
  case GGML_OP_CONV_2D:
2708
3007
  return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
2709
3008
  (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
@@ -2722,10 +3021,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2722
3021
  } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
2723
3022
  op->src[0]->type == GGML_TYPE_Q6_K) {
2724
3023
  return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
3024
+ } else if (op->src[0]->type == GGML_TYPE_Q8_0) {
3025
+ return op->src[1]->type == GGML_TYPE_F32;
2725
3026
  }
2726
3027
  return false;
2727
3028
  case GGML_OP_MUL_MAT_ID:
2728
3029
  if (op->src[0]->type == GGML_TYPE_Q4_0 ||
3030
+ op->src[0]->type == GGML_TYPE_Q8_0 ||
2729
3031
  op->src[0]->type == GGML_TYPE_MXFP4) {
2730
3032
  if (op->src[1]->type == GGML_TYPE_F32) {
2731
3033
  return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
@@ -2776,10 +3078,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2776
3078
  return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
2777
3079
  case GGML_OP_FLASH_ATTN_EXT:
2778
3080
  {
2779
- if (op->src[4]) {
2780
- return false;
2781
- }
2782
-
2783
3081
  const ggml_tensor * q = op->src[0];
2784
3082
  const ggml_tensor * k = op->src[1];
2785
3083
  const ggml_tensor * v = op->src[2];
@@ -2788,7 +3086,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2788
3086
  const int dv = v->ne[0];
2789
3087
 
2790
3088
  const struct { int dk; int dv; } supported_dims[] = {
2791
- { 64, 64}, { 80, 80}, { 96, 96},
3089
+ { 40, 40}, { 64, 64}, { 80, 80}, { 96, 96},
2792
3090
  {112, 112}, {128, 128}, {192, 128},
2793
3091
  {192, 192}, {256, 256},
2794
3092
  };
@@ -2840,6 +3138,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
2840
3138
  /* .graph_compute = */ ggml_backend_opencl_graph_compute,
2841
3139
  /* .event_record = */ NULL,
2842
3140
  /* .event_wait = */ NULL,
3141
+ /* .graph_optimize = */ NULL,
2843
3142
  };
2844
3143
 
2845
3144
  ggml_backend_t ggml_backend_opencl_init(void) {
@@ -2895,6 +3194,18 @@ struct ggml_backend_opencl_buffer_context {
2895
3194
  for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
2896
3195
  delete e;
2897
3196
  }
3197
+ for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
3198
+ delete e;
3199
+ }
3200
+ for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
3201
+ delete e;
3202
+ }
3203
+ for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
3204
+ delete e;
3205
+ }
3206
+ for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3207
+ delete e;
3208
+ }
2898
3209
  }
2899
3210
 
2900
3211
  ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -2927,6 +3238,36 @@ struct ggml_backend_opencl_buffer_context {
2927
3238
  return extra;
2928
3239
  }
2929
3240
 
3241
+ ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() { // Returns a cleared mxfp4 extra, reusing a pooled one when available.
3242
+ ggml_tensor_extra_cl_mxfp4 * extra;
3243
+ if (temp_tensor_extras_mxfp4.empty()) {
3244
+ extra = new ggml_tensor_extra_cl_mxfp4(); // Pool is empty: allocate a fresh extra.
3245
+ } else {
3246
+ extra = temp_tensor_extras_mxfp4.back(); // Otherwise take the last pooled extra.
3247
+ temp_tensor_extras_mxfp4.pop_back();
3248
+ }
3249
+
3250
+ temp_tensor_extras_mxfp4_in_use.push_back(extra); // Tracked so the context can recycle or delete it later.
3251
+
3252
+ extra->reset(); // Release any handles left over from a previous use.
3253
+ return extra;
3254
+ }
3255
+
3256
+ ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() { // Returns a cleared q8_0 extra, reusing a pooled one when available.
3257
+ ggml_tensor_extra_cl_q8_0 * extra;
3258
+ if (temp_tensor_extras_q8_0.empty()) {
3259
+ extra = new ggml_tensor_extra_cl_q8_0(); // Pool is empty: allocate a fresh extra.
3260
+ } else {
3261
+ extra = temp_tensor_extras_q8_0.back(); // Otherwise take the last pooled extra.
3262
+ temp_tensor_extras_q8_0.pop_back();
3263
+ }
3264
+
3265
+ temp_tensor_extras_q8_0_in_use.push_back(extra); // Tracked so the context can recycle or delete it later.
3266
+
3267
+ extra->reset(); // Release any handles left over from a previous use.
3268
+ return extra;
3269
+ }
3270
+
2930
3271
  void reset() {
2931
3272
  for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
2932
3273
  temp_tensor_extras.push_back(e);
@@ -2937,6 +3278,16 @@ struct ggml_backend_opencl_buffer_context {
2937
3278
  temp_tensor_extras_q4_0.push_back(e);
2938
3279
  }
2939
3280
  temp_tensor_extras_q4_0_in_use.clear();
3281
+
3282
+ for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
3283
+ temp_tensor_extras_mxfp4.push_back(e);
3284
+ }
3285
+ temp_tensor_extras_mxfp4_in_use.clear();
3286
+
3287
+ for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3288
+ temp_tensor_extras_q8_0.push_back(e);
3289
+ }
3290
+ temp_tensor_extras_q8_0_in_use.clear();
2940
3291
  }
2941
3292
 
2942
3293
  // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -2948,6 +3299,10 @@ struct ggml_backend_opencl_buffer_context {
2948
3299
  std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
2949
3300
  std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
2950
3301
  std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
3302
+ std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
3303
+ std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
3304
+ std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
3305
+ std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
2951
3306
 
2952
3307
  // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
2953
3308
  // before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -3032,6 +3387,12 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
3032
3387
  tensor->ne[2] == 1 && tensor->ne[3] == 1;
3033
3388
  }
3034
3389
 
3390
+ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) { // True when the tensor name contains "ffn" or "as" and ne[1] is a multiple of 64.
3391
+ GGML_UNUSED(backend_ctx);
3392
+ int ne01 = tensor->ne[1];
3393
+ return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0); // NOTE(review): "as" is a plain substring match, so names such as "bias" or "mask" also qualify - confirm this is intended.
3394
+ }
3395
+
3035
3396
  static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3036
3397
  ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
3037
3398
 
@@ -3291,39 +3652,192 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
3291
3652
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
3292
3653
 
3293
3654
  return;
3294
- }
3295
- #endif // GGML_OPENCL_SOA_Q
3296
-
3297
- ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
3298
- GGML_ASSERT(extra);
3299
-
3300
- CL_CHECK(clEnqueueWriteBuffer(
3301
- queue, extra->data_device, CL_TRUE, extra->offset + offset,
3302
- size, data, 0, NULL, NULL));
3303
-
3304
- GGML_UNUSED(buffer);
3305
- }
3306
3655
 
3307
- static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
3308
- GGML_ASSERT(tensor->extra);
3309
-
3310
- ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
3311
-
3312
- cl_context context = backend_ctx->context;
3313
- cl_command_queue queue = backend_ctx->queue;
3656
+ }
3657
+ if (tensor->type == GGML_TYPE_MXFP4) {
3658
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
3659
+ GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
3314
3660
 
3315
- // Make sure all previously submitted commands in other devices are finished.
3316
- sync_with_other_backends(backend_ctx);
3661
+ // Allocate the new extra and create aliases from the original.
3662
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3663
+ ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
3317
3664
 
3318
- #ifdef GGML_OPENCL_SOA_Q
3319
- // In end-to-end runs, get_tensor is usually used to get back the logits,
3320
- // where we can simply do clEnqueueReadBuffer since they are f32.
3321
- // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
3322
- // which requires reading back quantized weight tensors.
3323
- // To properly support this, we need to restore block_q4_0 struct arrays
3324
- // from the flattened buffers.
3325
- if (tensor->type == GGML_TYPE_Q4_0) {
3326
- ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
3665
+ size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
3666
+ size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
3667
+ GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
3668
+
3669
+ cl_int err;
3670
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3671
+ ggml_nbytes(tensor), NULL, &err);
3672
+ CL_CHECK(err);
3673
+ CL_CHECK(clEnqueueWriteBuffer(
3674
+ queue, data_device, CL_TRUE, 0,
3675
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
3676
+
3677
+ // The original tensor memory is divided into scales and quants, i.e.,
3678
+ // we first store scales, then quants.
3679
+ cl_buffer_region region;
3680
+
3681
+ // Create subbuffer for scales.
3682
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3683
+ region.size = size_e;
3684
+ extra->e = clCreateSubBuffer(
3685
+ extra_orig->data_device, CL_MEM_READ_WRITE,
3686
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3687
+ CL_CHECK(err);
3688
+ auto previous_origin = region.origin;
3689
+
3690
+ // Create subbuffer for quants.
3691
+ region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
3692
+ region.size = size_q;
3693
+ extra->q = clCreateSubBuffer(
3694
+ extra_orig->data_device, CL_MEM_READ_WRITE,
3695
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3696
+ CL_CHECK(err);
3697
+
3698
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3699
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
3700
+ cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
3701
+
3702
+ int ne00 = tensor->ne[0];
3703
+ int ne01 = tensor->ne[1];
3704
+ int ne02 = tensor->ne[2];
3705
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3706
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3707
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
3708
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
3709
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
3710
+
3711
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
3712
+ size_t local_work_size[3] = {64, 2, 1};
3713
+
3714
+ cl_event evt;
3715
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3716
+ CL_CHECK(clWaitForEvents(1, &evt));
3717
+ CL_CHECK(clReleaseMemObject(data_device));
3718
+ tensor->extra = extra;
3719
+
3720
+ return;
3721
+ }
3722
+ #endif
3723
+ cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
3724
+
3725
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3726
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3727
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
3728
+
3729
+ size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3730
+ size_t local_work_size[3] = {64, 1, 1};
3731
+
3732
+ cl_event evt;
3733
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3734
+ CL_CHECK(clWaitForEvents(1, &evt));
3735
+ CL_CHECK(clReleaseMemObject(data_device));
3736
+
3737
+ // Create image for Q
3738
+ cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
3739
+ cl_image_desc img_desc_q = {
3740
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
3741
+ static_cast<size_t>(ggml_nelements(tensor)/32*2),
3742
+ 0, 0, 0, 0, 0, 0, 0,
3743
+ { extra->q }
3744
+ };
3745
+ extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
3746
+ tensor->extra = extra;
3747
+
3748
+ return;
3749
+ }
3750
+ if (tensor->type == GGML_TYPE_Q8_0) {
3751
+ ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
3752
+ GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
3753
+
3754
+ // Allocate the new extra and create aliases from the original.
3755
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3756
+ ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
3757
+
3758
+ size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
3759
+ size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
3760
+ GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
3761
+
3762
+ cl_int err;
3763
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3764
+ ggml_nbytes(tensor), NULL, &err);
3765
+ CL_CHECK(err);
3766
+ CL_CHECK(clEnqueueWriteBuffer(
3767
+ queue, data_device, CL_TRUE, 0,
3768
+ ggml_nbytes(tensor), data, 0, NULL, NULL));
3769
+
3770
+ // The original tensor memory is divided into scales and quants, i.e.,
3771
+ // we first store scales, then quants.
3772
+ cl_buffer_region region;
3773
+
3774
+ // Create subbuffer for scales.
3775
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3776
+ region.size = size_d;
3777
+ extra->d = clCreateSubBuffer(
3778
+ extra_orig->data_device, CL_MEM_READ_WRITE,
3779
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3780
+ CL_CHECK(err);
3781
+ auto previous_origin = region.origin;
3782
+
3783
+ // Create subbuffer for quants.
3784
+ region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
3785
+ region.size = size_q;
3786
+ extra->q = clCreateSubBuffer(
3787
+ extra_orig->data_device, CL_MEM_READ_WRITE,
3788
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
3789
+ CL_CHECK(err);
3790
+
3791
+ cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
3792
+
3793
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3794
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3795
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
3796
+
3797
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3798
+ size_t local_work_size[] = {64, 1, 1};
3799
+
3800
+ cl_event evt;
3801
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3802
+ CL_CHECK(clWaitForEvents(1, &evt));
3803
+ CL_CHECK(clReleaseMemObject(data_device));
3804
+
3805
+ tensor->extra = extra;
3806
+
3807
+ return;
3808
+ }
3809
+ #endif // GGML_OPENCL_SOA_Q
3810
+
3811
+ ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
3812
+ GGML_ASSERT(extra);
3813
+
3814
+ CL_CHECK(clEnqueueWriteBuffer(
3815
+ queue, extra->data_device, CL_TRUE, extra->offset + offset,
3816
+ size, data, 0, NULL, NULL));
3817
+
3818
+ GGML_UNUSED(buffer);
3819
+ }
3820
+
3821
+ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
3822
+ GGML_ASSERT(tensor->extra);
3823
+
3824
+ ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
3825
+
3826
+ cl_context context = backend_ctx->context;
3827
+ cl_command_queue queue = backend_ctx->queue;
3828
+
3829
+ // Make sure all previously submitted commands in other devices are finished.
3830
+ sync_with_other_backends(backend_ctx);
3831
+
3832
+ #ifdef GGML_OPENCL_SOA_Q
3833
+ // In end-to-end runs, get_tensor is usually used to get back the logits,
3834
+ // where we can simply do clEnqueueReadBuffer since they are f32.
3835
+ // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
3836
+ // which requires reading back quantized weight tensors.
3837
+ // To properly support this, we need to restore block_q4_0 struct arrays
3838
+ // from the flattened buffers.
3839
+ if (tensor->type == GGML_TYPE_Q4_0) {
3840
+ ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
3327
3841
 
3328
3842
  cl_int err;
3329
3843
  cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
@@ -3338,6 +3852,84 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
3338
3852
  size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3339
3853
  size_t local_work_size[] = {1, 1, 1};
3340
3854
 
3855
+ cl_event evt;
3856
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3857
+ global_work_size, local_work_size, 0, NULL, &evt));
3858
+ CL_CHECK(clWaitForEvents(1, &evt));
3859
+ CL_CHECK(clEnqueueReadBuffer(
3860
+ queue, data_device, CL_TRUE, offset,
3861
+ size, data, 0, NULL, NULL));
3862
+ CL_CHECK(clReleaseMemObject(data_device));
3863
+ return;
3864
+ } else if (tensor->type == GGML_TYPE_MXFP4) {
3865
+ ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
3866
+
3867
+ cl_int err;
3868
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3869
+ ggml_nbytes(tensor), NULL, &err);
3870
+ CL_CHECK(err);
3871
+
3872
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3873
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
3874
+ cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
3875
+
3876
+ int ne00 = tensor->ne[0];
3877
+ int ne01 = tensor->ne[1];
3878
+ int ne02 = tensor->ne[2];
3879
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
3880
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
3881
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
3882
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
3883
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
3884
+
3885
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
3886
+ size_t local_work_size[3] = {64, 2, 1};
3887
+
3888
+ cl_event evt;
3889
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3890
+ global_work_size, local_work_size, 0, NULL, &evt));
3891
+ CL_CHECK(clWaitForEvents(1, &evt));
3892
+ CL_CHECK(clEnqueueReadBuffer(
3893
+ queue, data_device, CL_TRUE, offset,
3894
+ size, data, 0, NULL, NULL));
3895
+ CL_CHECK(clReleaseMemObject(data_device));
3896
+ return;
3897
+ }
3898
+ #endif
3899
+ cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
3900
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
3901
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
3902
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
3903
+
3904
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3905
+ size_t local_work_size[] = {1, 1, 1};
3906
+
3907
+ cl_event evt;
3908
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3909
+ global_work_size, local_work_size, 0, NULL, &evt));
3910
+ CL_CHECK(clWaitForEvents(1, &evt));
3911
+ CL_CHECK(clEnqueueReadBuffer(
3912
+ queue, data_device, CL_TRUE, offset,
3913
+ size, data, 0, NULL, NULL));
3914
+ CL_CHECK(clReleaseMemObject(data_device));
3915
+ return;
3916
+ }
3917
+ if (tensor->type == GGML_TYPE_Q8_0) {
3918
+ ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
3919
+
3920
+ cl_int err;
3921
+ cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3922
+ ggml_nbytes(tensor), NULL, &err);
3923
+ CL_CHECK(err);
3924
+
3925
+ cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
3926
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
3927
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
3928
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
3929
+
3930
+ size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3931
+ size_t local_work_size[] = {1, 1, 1};
3932
+
3341
3933
  cl_event evt;
3342
3934
  CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3343
3935
  global_work_size, local_work_size, 0, NULL, &evt));
@@ -3659,6 +4251,19 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
3659
4251
  CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
3660
4252
  CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
3661
4253
  CL_CHECK(clFinish(queue));
4254
+ } else if (tensor->type == GGML_TYPE_MXFP4) {
4255
+ ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *) tensor->extra;
4256
+ GGML_ASSERT(extra);
4257
+
4258
+ size_t size_q = ggml_nelements(tensor)/QK_MXFP4 * QK_MXFP4/2;
4259
+ size_t size_e = ggml_nelements(tensor)/QK_MXFP4 * sizeof(char);
4260
+ GGML_ASSERT(size_q + size_e == ggml_nbytes(tensor));
4261
+ buf_q = malloc(size_q);
4262
+ buf_d = malloc(size_e);
4263
+
4264
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
4265
+ CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
4266
+ CL_CHECK(clFinish(queue));
3662
4267
  } else {
3663
4268
  // Read out the tensor from GPU memory.
3664
4269
  ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
@@ -3782,15 +4387,19 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3782
4387
  GGML_ASSERT(dst);
3783
4388
  GGML_ASSERT(dst->extra);
3784
4389
 
3785
- const int ne00 = src0 ? src0->ne[0] : 0;
3786
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
3787
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
3788
- const int ne10 = src1 ? src1->ne[0] : 0;
3789
- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
3790
- const int ne11 = src1 ? src1->ne[1] : 0;
3791
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
3792
- const cl_ulong nb1 = dst ? dst->nb[1] : 0;
3793
- const cl_ulong nb2 = dst ? dst->nb[2] : 0;
4390
+ const int ne00 = src0->ne[0];
4391
+ const cl_ulong nb01 = src0->nb[1];
4392
+ const cl_ulong nb02 = src0->nb[2];
4393
+ const cl_ulong nb03 = src0->nb[3];
4394
+ const int ne10 = src1->ne[0];
4395
+ const cl_ulong nb10 = src1->nb[0];
4396
+ const int ne11 = src1->ne[1];
4397
+ const int ne12 = src1->ne[2];
4398
+ const cl_ulong nb11 = src1->nb[1];
4399
+ const cl_ulong nb12 = src1->nb[2];
4400
+ const cl_ulong nb1 = dst->nb[1];
4401
+ const cl_ulong nb2 = dst->nb[2];
4402
+ const cl_ulong nb3 = dst->nb[3];
3794
4403
 
3795
4404
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3796
4405
 
@@ -3827,14 +4436,17 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3827
4436
  CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
3828
4437
  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3829
4438
  CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3830
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
3831
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
3832
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
3833
- CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
3834
- CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
3835
-
3836
- size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
3837
- size_t local_work_size[] = {1, 1, 1};
4439
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
4440
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
4441
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb10));
4442
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
4443
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
4444
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
4445
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
4446
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
4447
+
4448
+ size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
4449
+ size_t local_work_size[] = {64, 1, 1};
3838
4450
 
3839
4451
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
3840
4452
  }
@@ -3846,6 +4458,7 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3846
4458
  GGML_ASSERT(src1->extra);
3847
4459
  GGML_ASSERT(dst);
3848
4460
  GGML_ASSERT(dst->extra);
4461
+ GGML_ASSERT(src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32);
3849
4462
 
3850
4463
  // ne0 = ne00
3851
4464
  // ne2 = ne02
@@ -3888,15 +4501,26 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3888
4501
 
3889
4502
  switch (dst->type) {
3890
4503
  case GGML_TYPE_F32:
3891
- kernel = backend_ctx->kernel_set_rows_f32;
4504
+ if (src1->type == GGML_TYPE_I64) {
4505
+ kernel = backend_ctx->kernel_set_rows_f32_i64;
4506
+ } else {
4507
+ kernel = backend_ctx->kernel_set_rows_f32_i32;
4508
+ }
3892
4509
  break;
3893
4510
  case GGML_TYPE_F16:
3894
- kernel = backend_ctx->kernel_set_rows_f16;
4511
+ if (src1->type == GGML_TYPE_I64) {
4512
+ kernel = backend_ctx->kernel_set_rows_f16_i64;
4513
+ } else {
4514
+ kernel = backend_ctx->kernel_set_rows_f16_i32;
4515
+ }
3895
4516
  break;
3896
4517
  default:
3897
4518
  GGML_ABORT("not implemented");
3898
4519
  }
3899
4520
 
4521
+ fastdiv_vals ne11_ = init_fastdiv_values(ne11);
4522
+ fastdiv_vals ne12_ = init_fastdiv_values(ne12);
4523
+
3900
4524
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3901
4525
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3902
4526
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
@@ -3907,8 +4531,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
3907
4531
  CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
3908
4532
  CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
3909
4533
  CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
3910
- CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
3911
- CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
4534
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
4535
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
3912
4536
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
3913
4537
  CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
3914
4538
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
@@ -5081,7 +5705,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
5081
5705
  CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
5082
5706
  CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
5083
5707
  CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps));
5084
- CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL));
5708
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL));
5085
5709
 
5086
5710
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5087
5711
  }
@@ -5425,7 +6049,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
5425
6049
  GGML_ASSERT(dst->extra);
5426
6050
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
5427
6051
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
5428
- GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
5429
6052
 
5430
6053
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5431
6054
 
@@ -5443,28 +6066,67 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
5443
6066
  const int s_ne0 = src0->ne[0];
5444
6067
  const int s_ne1 = src0->ne[1];
5445
6068
  const int s_ne2 = src0->ne[2];
6069
+ const int s_ne3 = src0->ne[3];
6070
+
6071
+ const int s_nb0 = src0->nb[0];
6072
+ const int s_nb1 = src0->nb[1];
6073
+ const int s_nb2 = src0->nb[2];
6074
+ const int s_nb3 = src0->nb[3];
5446
6075
 
5447
6076
  const int d_ne0 = dst->ne[0];
5448
6077
  const int d_ne1 = dst->ne[1];
5449
6078
  const int d_ne2 = dst->ne[2];
6079
+ const int d_ne3 = dst->ne[3];
6080
+
6081
+ const int d_nb0 = dst->nb[0];
6082
+ const int d_nb1 = dst->nb[1];
6083
+ const int d_nb2 = dst->nb[2];
6084
+ const int d_nb3 = dst->nb[3];
6085
+
6086
+ const int lp0 = ((const int*)(dst->op_params))[0];
6087
+ const int rp0 = ((const int*)(dst->op_params))[1];
6088
+ const int lp1 = ((const int*)(dst->op_params))[2];
6089
+ const int rp1 = ((const int*)(dst->op_params))[3];
6090
+ const int lp2 = ((const int*)(dst->op_params))[4];
6091
+ const int rp2 = ((const int*)(dst->op_params))[5];
6092
+ const int lp3 = ((const int*)(dst->op_params))[6];
6093
+ const int rp3 = ((const int*)(dst->op_params))[7];
5450
6094
 
5451
6095
  cl_kernel kernel = backend_ctx->kernel_pad;
5452
6096
 
5453
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
5454
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
5455
- CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
5456
- CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
5457
- CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
5458
- CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
5459
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
5460
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
5461
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
5462
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
6097
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
6098
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
6099
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
6100
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
6101
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
6102
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
6103
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
6104
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &s_ne3));
6105
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &s_nb0));
6106
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &s_nb1));
6107
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &s_nb2));
6108
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &s_nb3));
6109
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
6110
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
6111
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
6112
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &d_ne3));
6113
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &d_nb0));
6114
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &d_nb1));
6115
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &d_nb2));
6116
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &d_nb3));
6117
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &lp0));
6118
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &rp0));
6119
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &lp1));
6120
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &rp1));
6121
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &lp2));
6122
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &rp2));
6123
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &lp3));
6124
+ CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &rp3));
5463
6125
 
5464
6126
  size_t lws0 = 64;
5465
6127
  size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
5466
6128
 
5467
- size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
6129
+ size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2*d_ne3 };
5468
6130
  size_t local_work_size[] = { lws0, 1, 1 };
5469
6131
 
5470
6132
  size_t * local_work_size_ptr = local_work_size;
@@ -5554,8 +6216,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
5554
6216
  CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
5555
6217
  } else if (mode == GGML_SCALE_MODE_BILINEAR) {
5556
6218
  if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
5557
- sf0 = (float)(ne0 - 1) / (ne00 - 1);
5558
- sf1 = (float)(ne1 - 1) / (ne01 - 1);
6219
+ sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
6220
+ sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
5559
6221
  pixel_offset = 0.0f;
5560
6222
  }
5561
6223
 
@@ -5670,12 +6332,12 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
5670
6332
  } else {
5671
6333
  cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
5672
6334
 
5673
- long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
6335
+ cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
5674
6336
  cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
5675
6337
 
5676
6338
  cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
5677
6339
 
5678
- long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
6340
+ cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
5679
6341
  cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
5680
6342
 
5681
6343
 
@@ -5686,10 +6348,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
5686
6348
  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
5687
6349
  CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
5688
6350
 
5689
- CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
5690
- CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
5691
- CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
5692
- CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
6351
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
6352
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
6353
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
6354
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
5693
6355
  CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
5694
6356
  CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
5695
6357
  CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
@@ -5700,10 +6362,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
5700
6362
  CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
5701
6363
  CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
5702
6364
 
5703
- CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
5704
- CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
5705
- CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
5706
- CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
6365
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
6366
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
6367
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
6368
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
5707
6369
  CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
5708
6370
  CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
5709
6371
  CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
@@ -5765,6 +6427,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
5765
6427
  static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
5766
6428
  const ggml_tensor * v = dst->src[2];
5767
6429
  const ggml_tensor * mask = dst->src[3];
6430
+ const ggml_tensor * sinks = dst->src[4];
5768
6431
  GGML_ASSERT(q->extra);
5769
6432
  GGML_ASSERT(k->extra);
5770
6433
  GGML_ASSERT(v->extra);
@@ -5772,6 +6435,9 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
5772
6435
  if (mask) {
5773
6436
  GGML_ASSERT(mask->extra);
5774
6437
  }
6438
+ if (sinks) {
6439
+ GGML_ASSERT(sinks->extra);
6440
+ }
5775
6441
 
5776
6442
  ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5777
6443
 
@@ -5813,6 +6479,7 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
5813
6479
  ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
5814
6480
  ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
5815
6481
  ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
6482
+ ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;
5816
6483
 
5817
6484
  cl_ulong offset_q = extra_q->offset + q->view_offs;
5818
6485
  cl_ulong offset_k = extra_k->offset + k->view_offs;
@@ -5820,6 +6487,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
5820
6487
  cl_ulong offset_o = extra_o->offset + dst->view_offs;
5821
6488
  cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL;
5822
6489
  cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
6490
+ cl_mem sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
6491
+ cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;
5823
6492
 
5824
6493
  const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
5825
6494
  const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
@@ -5874,6 +6543,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
5874
6543
  CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
5875
6544
  CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2));
5876
6545
  CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3));
6546
+ CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem), &sinks_buffer));
6547
+ CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));
5877
6548
 
5878
6549
  if (n_q == 1) {
5879
6550
  const size_t wg_size = 64;
@@ -6017,6 +6688,146 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
6017
6688
  backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
6018
6689
  }
6019
6690
 
6691
+ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6692
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6693
+
6694
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6695
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6696
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6697
+
6698
+ const int ne00 = src0->ne[0];
6699
+ const int ne01 = src0->ne[1];
6700
+ const int ne02 = src0->ne[2];
6701
+
6702
+ const cl_ulong nb01 = src0->nb[1];
6703
+ const cl_ulong nb02 = src0->nb[2];
6704
+
6705
+ const int ne10 = src1->ne[0];
6706
+ const int ne11 = src1->ne[1];
6707
+ const int ne12 = src1->ne[2];
6708
+
6709
+ const cl_ulong nb10 = src1->nb[0];
6710
+
6711
+ const int ne0 = dst->ne[0];
6712
+ const int ne1 = dst->ne[1];
6713
+
6714
+ GGML_ASSERT(ne00 == ne10);
6715
+
6716
+ cl_kernel kernel;
6717
+ cl_context context = backend_ctx->context;
6718
+
6719
+ cl_int status;
6720
+ cl_image_format img_fmt_1d;
6721
+ cl_image_desc img_desc_1d;
6722
+ cl_buffer_region region;
6723
+ cl_mem A_image1d;
6724
+ cl_mem A_sub_buffer;
6725
+ cl_mem B_sub_buffer;
6726
+ cl_mem D_image1d;
6727
+ cl_mem D_sub_buffer;
6728
+
6729
+ int M = ne01;
6730
+ int N = ne1;
6731
+ int K = ne00;
6732
+
6733
+ if (nb01 > nb02) {
6734
+ // KQ
6735
+ kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
6736
+ } else {
6737
+ // KQV
6738
+ kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
6739
+ }
6740
+ // create sub-buffer for A
6741
+ // <--------------------------------------------> //
6742
+ extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
6743
+
6744
+ region.origin = (extra0->offset);
6745
+ if (nb01 > nb02) {
6746
+ // KQ
6747
+ region.size = nb01 * ne01;
6748
+ } else {
6749
+ // KQV
6750
+ region.size = nb02 * ne02;
6751
+ }
6752
+
6753
+ A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6754
+ CL_CHECK(status);
6755
+
6756
+ // <--------------------------------------------> //
6757
+
6758
+ // create sub-buffer for B
6759
+ // <--------------------------------------------> //
6760
+ region.origin = (extra1->offset);
6761
+ region.size = nb10 * ne10 * ne11 * ne12;
6762
+ B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6763
+ CL_CHECK(status);
6764
+ // <--------------------------------------------> //
6765
+
6766
+ img_fmt_1d = {CL_RGBA, CL_FLOAT};
6767
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
6768
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6769
+ if (nb01 > nb02) {
6770
+ img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
6771
+ }
6772
+ else {
6773
+ img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
6774
+ }
6775
+ img_desc_1d.buffer = A_sub_buffer;
6776
+ A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
6777
+ CL_CHECK(status);
6778
+
6779
+ // create sub-buffer for output C
6780
+ // <--------------------------------------------> //
6781
+ region.origin = (extrad->offset);
6782
+ region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
6783
+ D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
6784
+ CL_CHECK(status);
6785
+ // <--------------------------------------------> //
6786
+
6787
+ // create image for C output
6788
+ // <--------------------------------------------> //
6789
+ img_fmt_1d = {CL_R, CL_FLOAT};
6790
+ memset(&img_desc_1d, 0, sizeof(img_desc_1d));
6791
+ img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
6792
+ img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
6793
+ img_desc_1d.buffer = D_sub_buffer;
6794
+ D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
6795
+ CL_CHECK(status);
6796
+ // <--------------------------------------------> //
6797
+
6798
+ int offset_src0 = 0;
6799
+ int offset_src1 = 0;
6800
+
6801
+ // set kernel args
6802
+ // <--------------------------------------------> //
6803
+ cl_uint k_arg = 0;
6804
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
6805
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
6806
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
6807
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
6808
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
6809
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
6810
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
6811
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
6812
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
6813
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
6814
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
6815
+ CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
6816
+
6817
+ size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
6818
+ size_t local_work_size[3] = {64, 1, 2};
6819
+
6820
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6821
+
6822
+ // deallocate sub buffers and images
6823
+ // <--------------------------------------------> //
6824
+ CL_CHECK(clReleaseMemObject(A_image1d));
6825
+ CL_CHECK(clReleaseMemObject(D_image1d));
6826
+ CL_CHECK(clReleaseMemObject(A_sub_buffer));
6827
+ CL_CHECK(clReleaseMemObject(B_sub_buffer));
6828
+ CL_CHECK(clReleaseMemObject(D_sub_buffer));
6829
+ }
6830
+
6020
6831
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6021
6832
  GGML_ASSERT(src0);
6022
6833
  GGML_ASSERT(src0->extra);
@@ -6040,6 +6851,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6040
6851
 
6041
6852
  #ifdef GGML_OPENCL_SOA_Q
6042
6853
  ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
6854
+ ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
6855
+ ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
6043
6856
  #endif
6044
6857
 
6045
6858
  const int ne00 = src0 ? src0->ne[0] : 0;
@@ -6081,6 +6894,27 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6081
6894
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
6082
6895
  cl_context context = backend_ctx->context;
6083
6896
 
6897
+ if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
6898
+ if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
6899
+ // For KQ
6900
+ if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
6901
+ nb00 <= nb02 &&
6902
+ nb02 <= nb01 &&
6903
+ nb01 <= nb03 &&
6904
+ nb10 <= nb12 &&
6905
+ nb12 <= nb11 &&
6906
+ nb11 <= nb13) {
6907
+ ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
6908
+ return;
6909
+ }
6910
+ // For KQV
6911
+ if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
6912
+ ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
6913
+ return;
6914
+ }
6915
+ }
6916
+ }
6917
+
6084
6918
  if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
6085
6919
 
6086
6920
  // init CL objects
@@ -6454,6 +7288,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6454
7288
  backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6455
7289
  return;
6456
7290
  }
7291
+ case GGML_TYPE_Q8_0: {
7292
+ if (ne11 < 32) {
7293
+ break;
7294
+ }
7295
+ kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
7296
+ nth0 = 128; // calculated as (BM*BN)/(TM*TN)
7297
+
7298
+ int batch_stride_a = ne00*ne01;
7299
+ int batch_stride_b = ne10*ne11;
7300
+ int batch_stride_d = ne0*ne1;
7301
+
7302
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
7303
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
7304
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7305
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7306
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7307
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7308
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7309
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
7310
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
7311
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
7312
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
7313
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
7314
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
7315
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
7316
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
7317
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
7318
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
7319
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
7320
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
7321
+
7322
+ // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
7323
+ size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
7324
+ size_t local_work_size[] = {(size_t)nth0, 1, 1};
7325
+
7326
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
7327
+ return;
7328
+ }
6457
7329
  default:
6458
7330
  break;
6459
7331
  }
@@ -6709,7 +7581,84 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6709
7581
  #endif // GGML_OPENCL_SOA_Q
6710
7582
  break;
6711
7583
  case GGML_TYPE_Q4_1:
6712
- case GGML_TYPE_Q8_0:
7584
+ case GGML_TYPE_Q8_0: {
7585
+ #ifdef GGML_OPENCL_SOA_Q
7586
+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
7587
+
7588
+ // nth0 - subgroup size
7589
+ // nth1 - number of subgroups per workgroup
7590
+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
7591
+ if (backend_ctx->gpu_family == INTEL) {
7592
+ nth0 = 16;
7593
+ nth1 = 2;
7594
+ ndst = nth1*4;
7595
+ } else if (backend_ctx->gpu_family == ADRENO) {
7596
+ nth0 = 64;
7597
+ nth1 = 2;
7598
+ ndst = nth1*4;
7599
+ } else {
7600
+ GGML_ASSERT(false && "TODO: Unknown GPU");
7601
+ }
7602
+
7603
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
7604
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
7605
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7606
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7607
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7608
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7609
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7610
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
7611
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
7612
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
7613
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
7614
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
7615
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
7616
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
7617
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
7618
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
7619
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
7620
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
7621
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
7622
+ #else
7623
+ kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
7624
+
7625
+ // nth0 - subgroup size
7626
+ // nth1 - number of subgroups per workgroup
7627
+ // ndst - number of output values per workgroup = output per subgroup * number of subgroups
7628
+ if (backend_ctx->gpu_family == INTEL) {
7629
+ nth0 = 16;
7630
+ nth1 = 2;
7631
+ ndst = nth1*4;
7632
+ } else if (backend_ctx->gpu_family == ADRENO) {
7633
+ nth0 = 64;
7634
+ nth1 = 2;
7635
+ ndst = nth1*4;
7636
+ } else {
7637
+ GGML_ASSERT(false && "TODO: Unknown GPU");
7638
+ }
7639
+
7640
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7641
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7642
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7643
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7644
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7645
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7646
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7647
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
7648
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
7649
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
7650
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
7651
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
7652
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
7653
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
7654
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
7655
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
7656
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
7657
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
7658
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
7659
+ #endif // GGML_OPENCL_SOA_Q
7660
+ break;
7661
+ }
6713
7662
  case GGML_TYPE_Q2_K:
6714
7663
  case GGML_TYPE_Q3_K:
6715
7664
  case GGML_TYPE_Q4_K:
@@ -6744,6 +7693,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6744
7693
  CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
6745
7694
  break;
6746
7695
  case GGML_TYPE_MXFP4: {
7696
+ #ifdef GGML_OPENCL_SOA_Q
7697
+ kernel = backend_ctx->kernel_mul_mv_mxfp4_f32_flat;
7698
+
7699
+ cl_mem q;
7700
+ if (backend_ctx->gpu_family == INTEL) {
7701
+ nth0 = 16;
7702
+ nth1 = 2;
7703
+ ndst = nth1*2;
7704
+
7705
+ q = extra0_mxfp4->q;
7706
+ } else if (backend_ctx->gpu_family == ADRENO) {
7707
+ nth0 = 64;
7708
+ nth1 = 2;
7709
+ ndst = nth1*2;
7710
+
7711
+ q = extra0_mxfp4->q_img;
7712
+ } else {
7713
+ GGML_ASSERT(false && "TODO: Unknown GPU");
7714
+ }
7715
+
7716
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
7717
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
7718
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7719
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7720
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7721
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7722
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7723
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
7724
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
7725
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
7726
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
7727
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
7728
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
7729
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
7730
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
7731
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
7732
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
7733
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
7734
+ #else
6747
7735
  kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
6748
7736
 
6749
7737
  if (backend_ctx->gpu_family == INTEL) {
@@ -6777,6 +7765,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
6777
7765
  CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
6778
7766
  CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
6779
7767
  CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0,nullptr));
7768
+ #endif
6780
7769
  break;
6781
7770
  }
6782
7771
  default:
@@ -6842,8 +7831,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
6842
7831
  cl_ulong offset2 = extra2->offset + src2->view_offs;
6843
7832
  cl_ulong offsetd = extrad->offset + dst->view_offs;
6844
7833
 
7834
+ GGML_UNUSED(offset0);
7835
+
6845
7836
  #ifdef GGML_OPENCL_SOA_Q
6846
7837
  ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
7838
+ ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
7839
+ ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
6847
7840
  #endif
6848
7841
 
6849
7842
  const int ne00 = src0->ne[0];
@@ -6869,6 +7862,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
6869
7862
  const int ne21 = src2->ne[1];
6870
7863
 
6871
7864
  const cl_ulong nb21 = src2->nb[1];
7865
+ const cl_ulong nb20 = src2->nb[0];
7866
+
7867
+ UNUSED(nb20);
6872
7868
 
6873
7869
  const int ne0 = dst->ne[0];
6874
7870
  const int ne1 = dst->ne[1];
@@ -6931,7 +7927,227 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
6931
7927
 
6932
7928
  break;
6933
7929
  }
7930
+ case GGML_TYPE_Q8_0: {
7931
+ #ifdef GGML_OPENCL_SOA_Q
7932
+ kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
7933
+
7934
+ if (backend_ctx->gpu_family == INTEL) {
7935
+ sgs = 16;
7936
+ nsg = 2;
7937
+ ndst = 4;
7938
+ } else if (backend_ctx->gpu_family == ADRENO) {
7939
+ sgs = 64;
7940
+ nsg = 2;
7941
+ ndst = 4;
7942
+ } else {
7943
+ GGML_ASSERT(false && "TODO: Unknown GPU");
7944
+ }
7945
+
7946
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
7947
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
7948
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7949
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7950
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
7951
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
7952
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
7953
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
7954
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
7955
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
7956
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
7957
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
7958
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
7959
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
7960
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
7961
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
7962
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
7963
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
7964
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
7965
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
7966
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
7967
+ #else
7968
+ kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;
7969
+
7970
+ if (backend_ctx->gpu_family == INTEL) {
7971
+ sgs = 16;
7972
+ nsg = 2;
7973
+ ndst = 4;
7974
+ } else if (backend_ctx->gpu_family == ADRENO) {
7975
+ sgs = 64;
7976
+ nsg = 2;
7977
+ ndst = 4;
7978
+ } else {
7979
+ GGML_ASSERT(false && "TODO: Unknown GPU");
7980
+ }
7981
+
7982
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7983
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7984
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7985
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7986
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
7987
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
7988
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
7989
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
7990
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
7991
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
7992
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
7993
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
7994
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
7995
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
7996
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
7997
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
7998
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
7999
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
8000
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
8001
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
8002
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
8003
+ #endif // GGML_OPENCL_SOA_Q
8004
+ break;
8005
+ }
6934
8006
  case GGML_TYPE_MXFP4: {
8007
+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
8008
+ if (use_adreno_moe_kernels(backend_ctx, src0)) {
8009
+ cl_int status;
8010
+
8011
+ size_t local_size[3] = {64, 2, 1};
8012
+ size_t global_size[3] = {64, 2, 1};
8013
+
8014
+ cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
8015
+
8016
+ int tile_size = 320;
8017
+ if (ne12 == 1) { // for gemv
8018
+ kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32;
8019
+
8020
+ // create a sub_buffer for src2
8021
+ cl_buffer_region region;
8022
+ region.origin = offset2;
8023
+ region.size = ne20 * ne21 * sizeof(int);
8024
+ buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8025
+ CL_CHECK(status);
8026
+
8027
+ // set thread grid
8028
+ global_size[0] = static_cast<size_t>(ne01);
8029
+ global_size[1] = 4;
8030
+ global_size[2] = static_cast<size_t>(ne20);
8031
+ local_size[1] = 4;
8032
+ } else { // for gemm
8033
+ kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;
8034
+
8035
+ // preprocess router table
8036
+ int num_tiles_per_expert = (ne01 + tile_size - 1) / tile_size;
8037
+ void * host_src2_reorder = malloc(ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short));
8038
+ void * host_src2 = malloc(ne21 * nb21);
8039
+ CL_CHECK(clEnqueueReadBuffer(backend_ctx->queue, extra2->data_device, CL_TRUE, offset2, ne21 * nb21, host_src2, 0, NULL, NULL));
8040
+ int total_experts = nb21 / nb20;
8041
+ int out_idx = 0;
8042
+ for (int i_expert = 0; i_expert < ne02; i_expert++) {
8043
+ for (int i_tile = 0; i_tile < num_tiles_per_expert; i_tile++) {
8044
+ for (int j = 0; j < ne21; j++) {
8045
+ for (int i = 0; i < ne20; i++) {
8046
+ int expert = ((int *)host_src2)[j * total_experts + i];
8047
+ if (i_expert == expert) {
8048
+ ((short *)host_src2_reorder)[out_idx] = static_cast<short>(expert);
8049
+ ((short *)host_src2_reorder)[out_idx + 1] = static_cast<short>(j * ne11 + (i % ne11));
8050
+ ((short *)host_src2_reorder)[out_idx + 2] = static_cast<short>(j * ne20 + i);
8051
+ ((short *)host_src2_reorder)[out_idx + 3] = static_cast<short>(i_tile);
8052
+ out_idx += 4;
8053
+ }
8054
+ }
8055
+ }
8056
+ }
8057
+ }
8058
+ buf_src2 = clCreateBuffer(backend_ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short), host_src2_reorder, &status);
8059
+ CL_CHECK(status);
8060
+
8061
+ // set thread grid
8062
+ global_size[0] = static_cast<size_t>(tile_size);
8063
+ global_size[2] = static_cast<size_t>(ne20 * ne21 * num_tiles_per_expert);
8064
+ }
8065
+
8066
+ // create a sub_buffer for src1
8067
+ cl_buffer_region region;
8068
+ region.origin = offset1;
8069
+ region.size = ne10 * ne11 * ne12 * sizeof(float);
8070
+ src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8071
+ CL_CHECK(status);
8072
+
8073
+ // create image for src1
8074
+ cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
8075
+ cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
8076
+ buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
8077
+ CL_CHECK(status);
8078
+
8079
+ // Set kernel args
8080
+ int arg_idx = 0;
8081
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->q));
8082
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->e));
8083
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
8084
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
8085
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
8086
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
8087
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
8088
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
8089
+ if (ne12 == 1) {
8090
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
8091
+ } else {
8092
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &tile_size));
8093
+ }
8094
+
8095
+ // launch kernel
8096
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
8097
+
8098
+ // deallocate sub buffers and images
8099
+ CL_CHECK(clReleaseMemObject(src1_sub_buffer));
8100
+ CL_CHECK(clReleaseMemObject(buf_src1_image));
8101
+ CL_CHECK(clReleaseMemObject(buf_src2));
8102
+ return;
8103
+ } // else fallback to generic kernel
8104
+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
8105
+
8106
+ #ifdef GGML_OPENCL_SOA_Q
8107
+ kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;
8108
+
8109
+ cl_mem q;
8110
+ if (backend_ctx->gpu_family == INTEL) {
8111
+ sgs = 16;
8112
+ nsg = 2;
8113
+ ndst = 2;
8114
+
8115
+ q = extra0_mxfp4->q;
8116
+ } else if (backend_ctx->gpu_family == ADRENO) {
8117
+ sgs = 64;
8118
+ nsg = 1;
8119
+ ndst = 4;
8120
+
8121
+ q = extra0_mxfp4->q_img;
8122
+ } else {
8123
+ GGML_ASSERT(false && "TODO: Unknown GPU");
8124
+ }
8125
+
8126
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
8127
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
8128
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
8129
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
8130
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
8131
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
8132
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
8133
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
8134
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
8135
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
8136
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
8137
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
8138
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
8139
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
8140
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
8141
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
8142
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
8143
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
8144
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
8145
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
8146
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
8147
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
8148
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
8149
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
8150
+ #else // GGML_OPENCL_SOA_Q
6935
8151
  kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
6936
8152
 
6937
8153
  if (backend_ctx->gpu_family == INTEL) {
@@ -6971,7 +8187,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
6971
8187
  CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
6972
8188
  CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
6973
8189
  CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,nullptr));
6974
-
8190
+ #endif // GGML_OPENCL_SOA_Q
6975
8191
  break;
6976
8192
  }
6977
8193
  default:
@@ -7404,6 +8620,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
7404
8620
  const bool is_neox = mode & 2;
7405
8621
  const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
7406
8622
  const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
8623
+ const int is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
7407
8624
 
7408
8625
  if (is_mrope) {
7409
8626
  GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
@@ -7494,9 +8711,14 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
7494
8711
  CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
7495
8712
  CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
7496
8713
  CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
8714
+ // both mrope and vision kernels have sections
7497
8715
  if (is_mrope || is_vision) {
7498
8716
  CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
7499
8717
  }
8718
+ // only mrope has is_imrope
8719
+ if (is_mrope && !is_vision) {
8720
+ CL_CHECK(clSetKernelArg(kernel, 34, sizeof(int), &is_imrope));
8721
+ }
7500
8722
 
7501
8723
  size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
7502
8724
  size_t local_work_size[] = {(size_t)nth, 1, 1};