@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -11,10 +11,13 @@
11
11
 
12
12
  #include <webgpu/webgpu_cpp.h>
13
13
 
14
+ #include <atomic>
14
15
  #include <condition_variable>
15
16
  #include <cstring>
16
17
  #include <iostream>
18
+ #include <map>
17
19
  #include <mutex>
20
+ #include <optional>
18
21
  #include <string>
19
22
  #include <vector>
20
23
 
@@ -25,16 +28,76 @@
25
28
  # define WEBGPU_LOG_DEBUG(msg) ((void) 0)
26
29
  #endif // GGML_WEBGPU_DEBUG
27
30
 
31
+ #ifdef GGML_WEBGPU_CPU_PROFILE
32
+ // total timing (aggregated)
33
+ # define WEBGPU_CPU_PROFILE_TOTAL_START(id) auto cpu_total_start_##id = std::chrono::high_resolution_clock::now();
34
+
35
+ # define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \
36
+ auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \
37
+ double cpu_total_time_##id = \
38
+ std::chrono::duration<double, std::milli>(cpu_total_end_##id - cpu_total_start_##id).count(); \
39
+ (ctx)->cpu_time_ms[#id] += cpu_total_time_##id;
40
+
41
+ // fine-grained timing (not included in totals)
42
+ # define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now();
43
+
44
+ # define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \
45
+ auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \
46
+ double cpu_detail_time_##id = \
47
+ std::chrono::duration<double, std::milli>(cpu_detail_end_##id - cpu_detail_start_##id).count(); \
48
+ (ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id;
49
+ #else
50
+ # define WEBGPU_CPU_PROFILE_TOTAL_START(id)
51
+ # define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx)
52
+ # define WEBGPU_CPU_PROFILE_DETAIL_START(id)
53
+ # define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx)
54
+ #endif // GGML_WEBGPU_CPU_PROFILE
55
+
56
+ #ifdef GGML_WEBGPU_GPU_PROFILE
57
+ # define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24
58
+ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps
59
+ #endif
60
+
28
61
  /* Constants */
29
62
 
30
- #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16
31
- #define WEBGPU_MUL_MAT_WG_SIZE 64
32
- #define WEBGPU_NUM_PARAM_BUFS 100
63
+ #define WEBGPU_MUL_MAT_WG_SIZE 256
64
+ #define WEBGPU_NUM_PARAM_BUFS 32u
65
+ #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u
66
+ #define WEBGPU_WAIT_ANY_TIMEOUT_MS 0
67
+ // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool
68
+ #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE
33
69
  #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
34
70
  #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32
35
71
  #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
36
72
  #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
37
73
 
74
+ // For operations which process a row in parallel, this seems like a reasonable default
75
+ #define WEBGPU_ROW_SPLIT_WG_SIZE 64
76
+
77
+ // Matrix multiplication parameters
78
+
79
+ // Register tiling parameters
80
+ #define WEBGPU_MUL_MAT_TILE_M 8
81
+ #define WEBGPU_MUL_MAT_TILE_N 8
82
+ #define WEBGPU_MUL_MAT_WG_SIZE_M 8
83
+ #define WEBGPU_MUL_MAT_WG_SIZE_N 8
84
+ #define WEBGPU_MUL_MAT_TILE_K 32
85
+
86
+ // Subgroup matrix parameters
87
+ // The number of subgroups in the M dimension
88
+ #define WEBGPU_MUL_MAT_SUBGROUP_M 2
89
+ // The number of subgroups in the N dimension
90
+ #define WEBGPU_MUL_MAT_SUBGROUP_N 2
91
+ // The number of subgroup matrices each subgroup accumulates over
92
+ #define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4
93
+ #define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2
94
+
95
+ // Matrix-vector multiplication parameters
96
+ #define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
97
+ // Must be multiple of 4 to work with vectorized paths, and must divide mul_mat_vec wg size
98
+ #define WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG 64
99
+ #define WEBGPU_MUL_MAT_VEC_TILE_K 256
100
+
38
101
  /* End Constants */
39
102
 
40
103
  // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
@@ -62,6 +125,11 @@ struct webgpu_pool_bufs {
62
125
  wgpu::Buffer dev_buf;
63
126
  };
64
127
 
128
+ // The futures to wait on for a single queue submission
129
+ struct webgpu_submission_futures {
130
+ std::vector<wgpu::FutureWaitInfo> futures;
131
+ };
132
+
65
133
  // Holds a pool of parameter buffers for WebGPU operations
66
134
  struct webgpu_buf_pool {
67
135
  std::vector<webgpu_pool_bufs> free;
@@ -108,6 +176,83 @@ struct webgpu_buf_pool {
108
176
  }
109
177
  };
110
178
 
179
+ #ifdef GGML_WEBGPU_GPU_PROFILE
180
+ struct webgpu_gpu_profile_bufs {
181
+ wgpu::Buffer host_buf;
182
+ wgpu::Buffer dev_buf;
183
+ wgpu::QuerySet query_set;
184
+ };
185
+
186
+ // Holds a pool of parameter buffers for WebGPU operations
187
+ struct webgpu_gpu_profile_buf_pool {
188
+ std::vector<webgpu_gpu_profile_bufs> free;
189
+
190
+ std::mutex mutex;
191
+
192
+ std::condition_variable cv;
193
+
194
+ void init(wgpu::Device device,
195
+ int num_bufs,
196
+ size_t buf_size,
197
+ wgpu::BufferUsage dev_buf_usage,
198
+ wgpu::BufferUsage host_buf_usage) {
199
+ for (int i = 0; i < num_bufs; i++) {
200
+ wgpu::Buffer host_buf;
201
+ wgpu::Buffer dev_buf;
202
+ ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf");
203
+ ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf");
204
+ // Create a query set for 2 timestamps
205
+ wgpu::QuerySetDescriptor ts_query_set_desc = {};
206
+
207
+ ts_query_set_desc.type = wgpu::QueryType::Timestamp;
208
+ ts_query_set_desc.count = 2;
209
+ wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc);
210
+
211
+ free.push_back({ host_buf, dev_buf, ts_query_set });
212
+ }
213
+ }
214
+
215
+ webgpu_gpu_profile_bufs alloc_bufs() {
216
+ std::unique_lock<std::mutex> lock(mutex);
217
+ cv.wait(lock, [this] { return !free.empty(); });
218
+ webgpu_gpu_profile_bufs bufs = free.back();
219
+ free.pop_back();
220
+ return bufs;
221
+ }
222
+
223
+ void free_bufs(std::vector<webgpu_gpu_profile_bufs> bufs) {
224
+ std::lock_guard<std::mutex> lock(mutex);
225
+ free.insert(free.end(), bufs.begin(), bufs.end());
226
+ cv.notify_all();
227
+ }
228
+
229
+ void cleanup() {
230
+ std::lock_guard<std::mutex> lock(mutex);
231
+ for (auto & bufs : free) {
232
+ bufs.host_buf.Destroy();
233
+ bufs.dev_buf.Destroy();
234
+ bufs.query_set.Destroy();
235
+ }
236
+ free.clear();
237
+ }
238
+ };
239
+ #endif
240
+
241
+ struct webgpu_pipeline {
242
+ wgpu::ComputePipeline pipeline;
243
+ std::string name;
244
+ };
245
+
246
+ struct webgpu_command {
247
+ wgpu::CommandBuffer commands;
248
+ webgpu_pool_bufs params_bufs;
249
+ std::optional<webgpu_pool_bufs> set_rows_error_bufs;
250
+ #ifdef GGML_WEBGPU_GPU_PROFILE
251
+ webgpu_gpu_profile_bufs timestamp_query_bufs;
252
+ std::string pipeline_name;
253
+ #endif
254
+ };
255
+
111
256
  // All the base objects needed to run operations on a WebGPU device
112
257
  struct webgpu_context_struct {
113
258
  wgpu::Instance instance;
@@ -116,35 +261,64 @@ struct webgpu_context_struct {
116
261
  wgpu::Queue queue;
117
262
  wgpu::Limits limits;
118
263
 
264
+ bool supports_subgroup_matrix = false;
265
+ uint32_t subgroup_size;
266
+ wgpu::SubgroupMatrixConfig subgroup_matrix_config;
267
+
268
+ // Separate this out from limits since on some Metal systems, the limit returned by
269
+ // querying the limits is higher than the actual allowed maximum.
270
+ uint32_t max_wg_size_x;
271
+
119
272
  std::recursive_mutex mutex;
273
+ std::atomic_uint inflight_threads = 0;
120
274
 
121
275
  webgpu_buf_pool param_buf_pool;
122
276
  webgpu_buf_pool set_rows_error_buf_pool;
123
277
 
124
- wgpu::ComputePipeline memset_pipeline;
125
- wgpu::ComputePipeline mul_mat_pipeline[30][2];
126
- wgpu::ComputePipeline set_rows_pipeline;
127
- wgpu::ComputePipeline cpy_pipeline;
278
+ webgpu_pipeline memset_pipeline;
279
+
280
+ std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines; // src0_type, src1_type, vectorized
281
+ std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
282
+ mul_mat_vec_pipelines; // src0_type, src1_type, vectorized
283
+
284
+ webgpu_pipeline mul_mat_pipeline[30][2];
285
+ webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized
286
+ webgpu_pipeline get_rows_pipeline[30];
287
+ webgpu_pipeline get_rows_f32_no_vec_pipeline;
288
+ webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type
289
+ webgpu_pipeline add_pipeline[2][2]; // type, inplace
290
+ webgpu_pipeline sub_pipeline[2][2]; // type, inplace
291
+ webgpu_pipeline mul_pipeline[2][2]; // type, inplace
292
+ webgpu_pipeline div_pipeline[2][2]; // type, inplace
293
+ webgpu_pipeline rms_norm_pipeline[2]; // inplace
294
+ webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace
295
+ webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split
296
+ webgpu_pipeline scale_pipeline[2]; // inplace
297
+ webgpu_pipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace
128
298
 
129
299
  size_t memset_bytes_per_thread;
130
300
 
131
301
  // Staging buffer for reading data from the GPU
132
302
  wgpu::Buffer get_tensor_staging_buf;
133
303
 
134
- // Command buffers which need to be submitted
135
- std::vector<wgpu::CommandBuffer> staged_command_bufs;
136
-
137
- // Parameter buffers associated with the staged command buffers
138
- std::vector<webgpu_pool_bufs> staged_param_bufs;
139
- // Buffers associated with set_rows operations, used to store potential errors
140
- std::vector<webgpu_pool_bufs> staged_set_row_error_bufs;
141
-
142
- std::vector<wgpu::FutureWaitInfo> callback_futures;
143
-
144
304
  #ifdef GGML_WEBGPU_DEBUG
145
305
  wgpu::Buffer debug_host_buf;
146
306
  wgpu::Buffer debug_dev_buf;
147
307
  #endif
308
+
309
+ #ifdef GGML_WEBGPU_CPU_PROFILE
310
+ // Profiling: labeled CPU time in ms (total)
311
+ std::unordered_map<std::string, double> cpu_time_ms;
312
+ // Profiling: detailed CPU time in ms
313
+ std::unordered_map<std::string, double> cpu_detail_ms;
314
+ #endif
315
+
316
+ #ifdef GGML_WEBGPU_GPU_PROFILE
317
+ // Profiling: per-shader GPU time in ms
318
+ std::unordered_map<std::string, double> shader_gpu_time_ms;
319
+ // Profiling: pool of timestamp query buffers (one per operation)
320
+ webgpu_gpu_profile_buf_pool timestamp_query_buf_pool;
321
+ #endif
148
322
  };
149
323
 
150
324
  typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -169,23 +343,66 @@ struct ggml_backend_webgpu_context {
169
343
  struct ggml_backend_webgpu_buffer_context {
170
344
  webgpu_context webgpu_ctx;
171
345
  wgpu::Buffer buffer;
346
+ std::string label;
172
347
 
173
- ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
348
+ ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf, std::string lbl) :
174
349
  webgpu_ctx(std::move(ctx)),
175
- buffer(std::move(buf)) {}
350
+ buffer(std::move(buf)),
351
+ label(std::move(lbl)) {}
176
352
  };
177
353
 
178
354
  /* End struct definitions */
179
355
 
180
356
  /* WebGPU object initializations */
181
357
 
358
+ // Process a WGSL shader string, replacing tokens of the form {{KEY}} with
359
+ // the corresponding values provided in `repls`.
360
+ static std::string ggml_webgpu_process_shader_repls(const char * src,
361
+ const std::map<std::string, std::string> & repls) {
362
+ if (!src) {
363
+ return std::string();
364
+ }
365
+ std::string s = src;
366
+ for (const auto & kv : repls) {
367
+ std::string token = "{{" + kv.first + "}}";
368
+ size_t pos = 0;
369
+ while ((pos = s.find(token, pos)) != std::string::npos) {
370
+ s.replace(pos, token.length(), kv.second);
371
+ pos += kv.second.length();
372
+ }
373
+ }
374
+ return s;
375
+ }
376
+
182
377
  static void ggml_webgpu_create_pipeline(wgpu::Device & device,
183
- wgpu::ComputePipeline & pipeline,
378
+ webgpu_pipeline & pipeline,
184
379
  const char * shader_code,
185
380
  const char * label,
186
381
  const std::vector<wgpu::ConstantEntry> & constants = {}) {
187
- WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()");
382
+ wgpu::ShaderSourceWGSL shader_source;
383
+ shader_source.code = shader_code;
384
+
385
+ wgpu::ShaderModuleDescriptor shader_desc;
386
+ shader_desc.nextInChain = &shader_source;
387
+
388
+ wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc);
389
+
390
+ wgpu::ComputePipelineDescriptor pipeline_desc;
391
+ pipeline_desc.label = label;
392
+ pipeline_desc.compute.module = shader_module;
393
+ pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code
394
+ pipeline_desc.layout = nullptr; // nullptr means auto layout
395
+ if (constants.size() > 0) {
396
+ pipeline_desc.compute.constants = constants.data();
397
+ pipeline_desc.compute.constantCount = constants.size();
398
+ }
399
+ pipeline = { device.CreateComputePipeline(&pipeline_desc), label };
400
+ }
188
401
 
402
+ static webgpu_pipeline ggml_webgpu_create_pipeline2(wgpu::Device & device,
403
+ const char * shader_code,
404
+ const char * label,
405
+ const std::vector<wgpu::ConstantEntry> & constants = {}) {
189
406
  wgpu::ShaderSourceWGSL shader_source;
190
407
  shader_source.code = shader_code;
191
408
 
@@ -203,7 +420,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
203
420
  pipeline_desc.compute.constants = constants.data();
204
421
  pipeline_desc.compute.constantCount = constants.size();
205
422
  }
206
- pipeline = device.CreateComputePipeline(&pipeline_desc);
423
+ return { device.CreateComputePipeline(&pipeline_desc), label };
207
424
  }
208
425
 
209
426
  static void ggml_webgpu_create_buffer(wgpu::Device & device,
@@ -211,8 +428,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
211
428
  size_t size,
212
429
  wgpu::BufferUsage usage,
213
430
  const char * label) {
214
- WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()");
215
-
216
431
  wgpu::BufferDescriptor buffer_desc;
217
432
  buffer_desc.size = size;
218
433
  buffer_desc.usage = usage;
@@ -228,81 +443,35 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
228
443
  /** WebGPU Actions */
229
444
 
230
445
  // Wait for the queue to finish processing all submitted work
231
- static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
232
- std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
233
- if (ctx->callback_futures.empty()) {
234
- // no existing callbacks, wait on queue submission
235
- ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
236
- wgpu::CallbackMode::AllowSpontaneous,
237
- [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
238
- if (status != wgpu::QueueWorkDoneStatus::Success) {
239
- GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
240
- }
241
- }),
242
- UINT64_MAX);
243
- } else {
244
- // existing callbacks, wait on them
245
- ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);
246
- ctx->callback_futures.clear();
446
+ static void ggml_backend_webgpu_wait(webgpu_context & ctx,
447
+ std::vector<webgpu_submission_futures> & futures,
448
+ bool block = true) {
449
+ // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
450
+ // inflight_max may be 0, meaning that we must wait on all futures.
451
+ uint64_t timeout_ms = block ? UINT64_MAX : 0;
452
+ uint inflight_threads = ctx->inflight_threads;
453
+ uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
454
+ while (futures.size() >= inflight_max && futures.size() > 0) {
455
+ ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
456
+ futures.erase(futures.begin());
247
457
  }
248
- }
249
-
250
- static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
251
- std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
252
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()");
253
- if (ctx->staged_command_bufs.empty()) {
254
- // Nothing to submit
255
- return;
256
- }
257
- ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
258
-
259
- // If there are SET_ROWS operations in this submission, copy their error buffers to the host.
260
- if (ctx->staged_set_row_error_bufs.size() > 0) {
261
- wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
262
- for (auto & error_bufs : ctx->staged_set_row_error_bufs) {
263
- // Copy the error buffer to the host buffer
264
- encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize());
458
+ size_t i = 0;
459
+ while (i < futures.size()) {
460
+ auto waitStatus = ctx->instance.WaitAny(futures[i].futures.size(), futures[i].futures.data(), timeout_ms);
461
+ switch (waitStatus) {
462
+ case wgpu::WaitStatus::Success:
463
+ futures.erase(futures.begin() + i);
464
+ break;
465
+ case wgpu::WaitStatus::TimedOut:
466
+ i++;
467
+ break;
468
+ case wgpu::WaitStatus::Error:
469
+ GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n");
470
+ break;
471
+ default:
472
+ GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an unknown status\n");
473
+ break;
265
474
  }
266
- wgpu::CommandBuffer commands = encoder.Finish();
267
- ctx->queue.Submit(1, &commands);
268
- }
269
-
270
- ctx->staged_command_bufs.clear();
271
- std::vector<webgpu_pool_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
272
- std::vector<webgpu_pool_bufs> staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs);
273
-
274
- // Free the staged parameter buffers once the submission completes
275
- wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone(
276
- wgpu::CallbackMode::AllowSpontaneous,
277
- [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
278
- if (status != wgpu::QueueWorkDoneStatus::Success) {
279
- GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
280
- }
281
- // Free the staged buffers
282
- ctx->param_buf_pool.free_bufs(staged_param_bufs);
283
- });
284
- ctx->callback_futures.push_back({ p_f });
285
-
286
- // Check for errrors in SET_ROWS operations
287
- for (auto & error_bufs : staged_set_row_error_bufs) {
288
- wgpu::Future f = error_bufs.host_buf.MapAsync(
289
- wgpu::MapMode::Read,
290
- 0,
291
- error_bufs.host_buf.GetSize(),
292
- wgpu::CallbackMode::AllowSpontaneous,
293
- [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
294
- if (status != wgpu::MapAsyncStatus::Success) {
295
- GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
296
- } else {
297
- const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange();
298
- if (*error_data) {
299
- GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
300
- }
301
- // We can't unmap in here due to WebGPU reentrancy limitations.
302
- ctx->set_rows_error_buf_pool.free_bufs({ error_bufs });
303
- }
304
- });
305
- ctx->callback_futures.push_back({ f });
306
475
  }
307
476
  }
308
477
 
@@ -311,10 +480,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
311
480
  wgpu::MapMode mode,
312
481
  size_t offset,
313
482
  size_t size) {
314
- ctx->instance.WaitAny(buffer.MapAsync(mode,
315
- offset,
316
- size,
317
- wgpu::CallbackMode::AllowSpontaneous,
483
+ ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
318
484
  [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
319
485
  if (status != wgpu::MapAsyncStatus::Success) {
320
486
  GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
@@ -329,7 +495,6 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
329
495
  // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and
330
496
  // debug statements in the shader, and then call this function after encoding the commands and submitting them.
331
497
  static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
332
- ggml_backend_webgpu_submit_queue(ctx);
333
498
  wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
334
499
  encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
335
500
  wgpu::CommandBuffer commands = encoder.Finish();
@@ -346,12 +511,86 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
346
511
  }
347
512
  #endif
348
513
 
349
- static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx,
350
- wgpu::ComputePipeline & pipeline,
351
- std::vector<uint32_t> params,
352
- std::vector<wgpu::BindGroupEntry> bind_group_entries,
353
- uint32_t wg_x,
354
- bool submit_and_wait = false) {
514
+ static webgpu_submission_futures ggml_backend_webgpu_submit(webgpu_context ctx, std::vector<webgpu_command> commands) {
515
+ std::vector<wgpu::CommandBuffer> command_buffers;
516
+ std::vector<webgpu_pool_bufs> params_bufs;
517
+ std::vector<webgpu_pool_bufs> set_rows_error_bufs;
518
+ #ifdef GGML_WEBGPU_GPU_PROFILE
519
+ std::vector<std::pair<std::string, webgpu_gpu_profile_bufs>> pipeline_name_and_ts_bufs;
520
+ #endif
521
+
522
+ for (const auto & command : commands) {
523
+ command_buffers.push_back(command.commands);
524
+ params_bufs.push_back(command.params_bufs);
525
+ if (command.set_rows_error_bufs) {
526
+ set_rows_error_bufs.push_back(command.set_rows_error_bufs.value());
527
+ }
528
+ }
529
+ ctx->queue.Submit(command_buffers.size(), command_buffers.data());
530
+
531
+ std::vector<wgpu::FutureWaitInfo> futures;
532
+
533
+ wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone(
534
+ wgpu::CallbackMode::AllowSpontaneous,
535
+ [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
536
+ if (status != wgpu::QueueWorkDoneStatus::Success) {
537
+ GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
538
+ }
539
+ // Free the staged buffers
540
+ ctx->param_buf_pool.free_bufs({ params_bufs });
541
+ });
542
+ futures.push_back({ p_f });
543
+
544
+ for (const auto & bufs : set_rows_error_bufs) {
545
+ wgpu::Future f = bufs.host_buf.MapAsync(
546
+ wgpu::MapMode::Read, 0, bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
547
+ [ctx, bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
548
+ if (status != wgpu::MapAsyncStatus::Success) {
549
+ GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
550
+ } else {
551
+ const uint32_t * error_data = (const uint32_t *) bufs.host_buf.GetConstMappedRange();
552
+ if (*error_data) {
553
+ GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
554
+ }
555
+ // We can't unmap in here due to WebGPU reentrancy limitations.
556
+ ctx->set_rows_error_buf_pool.free_bufs({ bufs });
557
+ }
558
+ });
559
+ futures.push_back({ f });
560
+ }
561
+
562
+ #ifdef GGML_WEBGPU_GPU_PROFILE
563
+ for (const auto & command : commands) {
564
+ auto label = command.pipeline_name;
565
+ auto ts_bufs = command.timestamp_query_bufs;
566
+
567
+ wgpu::Future f = ts_bufs.host_buf.MapAsync(
568
+ wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
569
+ [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
570
+ if (status != wgpu::MapAsyncStatus::Success) {
571
+ GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
572
+ } else {
573
+ const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange();
574
+ // WebGPU timestamps are in ns; convert to ms
575
+ double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6;
576
+ ctx->shader_gpu_time_ms[label] += elapsed_ms;
577
+ // We can't unmap in here due to WebGPU reentrancy limitations.
578
+ ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs });
579
+ }
580
+ });
581
+ futures.push_back({ f });
582
+ }
583
+ #endif
584
+ return { futures };
585
+ }
586
+
587
+ static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx,
588
+ webgpu_pipeline & pipeline,
589
+ std::vector<uint32_t> params,
590
+ std::vector<wgpu::BindGroupEntry> bind_group_entries,
591
+ uint32_t wg_x,
592
+ uint32_t wg_y = 1,
593
+ std::optional<webgpu_pool_bufs> set_rows_error_bufs = std::nullopt) {
355
594
  webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
356
595
 
357
596
  ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
@@ -369,41 +608,58 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
369
608
  .size = params_bufs.dev_buf.GetSize() });
370
609
 
371
610
  wgpu::BindGroupDescriptor bind_group_desc;
372
- bind_group_desc.layout = pipeline.GetBindGroupLayout(0);
611
+ bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0);
373
612
  bind_group_desc.entryCount = bind_group_entries.size();
374
613
  bind_group_desc.entries = bind_group_entries.data();
614
+ bind_group_desc.label = pipeline.name.c_str();
375
615
  wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
376
616
 
377
617
  wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
378
618
  encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize());
619
+
620
+ #ifdef GGML_WEBGPU_GPU_PROFILE
621
+ // --- Profiling: GPU timestamp queries ---
622
+ // Allocate a timestamp query buffer (2 timestamps: start/end)
623
+ webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs();
624
+ if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
625
+ ts_bufs.host_buf.Unmap();
626
+ }
627
+
628
+ wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set,
629
+ .beginningOfPassWriteIndex = 0,
630
+ .endOfPassWriteIndex = 1 };
631
+ wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
632
+ wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc);
633
+ #else
379
634
  wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
380
- pass.SetPipeline(pipeline);
635
+ #endif
636
+ pass.SetPipeline(pipeline.pipeline);
381
637
  pass.SetBindGroup(0, bind_group);
382
- pass.DispatchWorkgroups(wg_x, 1, 1);
638
+ pass.DispatchWorkgroups(wg_x, wg_y, 1);
383
639
  pass.End();
384
- wgpu::CommandBuffer commands = encoder.Finish();
385
- if (submit_and_wait) {
386
- // Submit and wait immediately
387
- ctx->queue.Submit(1, &commands);
388
- ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
389
- wgpu::CallbackMode::AllowSpontaneous,
390
- [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
391
- if (status != wgpu::QueueWorkDoneStatus::Success) {
392
- GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
393
- }
394
- ctx->param_buf_pool.free_bufs({ params_bufs });
395
- }),
396
- UINT64_MAX);
397
- } else {
398
- // Lock the context mutex when pushing to the staging vectors.
399
- std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
400
- // Enqueue commands and only submit if we have enough staged commands
401
- ctx->staged_command_bufs.push_back(commands);
402
- ctx->staged_param_bufs.push_back(params_bufs);
403
- if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
404
- ggml_backend_webgpu_submit_queue(ctx);
405
- }
640
+
641
+ #ifdef GGML_WEBGPU_GPU_PROFILE
642
+ // Resolve the query set into the device buffer
643
+ encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0);
644
+ encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize());
645
+ #endif
646
+
647
+ // If there are SET_ROWS operations in this submission, copy their error buffers to the host.
648
+ if (set_rows_error_bufs) {
649
+ encoder.CopyBufferToBuffer(set_rows_error_bufs->dev_buf, 0, set_rows_error_bufs->host_buf, 0,
650
+ set_rows_error_bufs->host_buf.GetSize());
406
651
  }
652
+
653
+ wgpu::CommandBuffer commands = encoder.Finish();
654
+ webgpu_command result = {};
655
+ result.commands = commands;
656
+ result.params_bufs = params_bufs;
657
+ result.set_rows_error_bufs = set_rows_error_bufs;
658
+ #ifdef GGML_WEBGPU_GPU_PROFILE
659
+ result.timestamp_query_bufs = ts_bufs;
660
+ result.pipeline_name = pipeline.name;
661
+ #endif
662
+ return result;
407
663
  }
408
664
 
409
665
  static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
@@ -415,9 +671,12 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
415
671
  std::vector<wgpu::BindGroupEntry> entries = {
416
672
  { .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }
417
673
  };
418
- size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
674
+ size_t bytes_per_wg = ctx->max_wg_size_x * ctx->memset_bytes_per_thread;
419
675
  uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
420
- ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
676
+
677
+ webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x);
678
+ std::vector<webgpu_submission_futures> futures = { ggml_backend_webgpu_submit(ctx, { command }) };
679
+ ggml_backend_webgpu_wait(ctx, futures);
421
680
  }
422
681
 
423
682
  /** End WebGPU Actions */
@@ -433,8 +692,48 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
433
692
  ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
434
693
  WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
435
694
 
436
- // TODO: cleanup
695
+ #ifdef GGML_WEBGPU_CPU_PROFILE
696
+ std::cout << "\n[ggml_webgpu cpu profiling summary]\n";
697
+ double total_cpu = 0.0;
698
+ for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) {
699
+ total_cpu += kv.second;
700
+ }
701
+ std::cout << "ggml_webgpu: total cpu time: " << total_cpu << " ms\n";
702
+ std::cout << "ggml_webgpu: cpu breakdown:\n";
703
+ for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) {
704
+ double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0;
705
+ std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
706
+ }
707
+ if (ctx->webgpu_ctx->cpu_detail_ms.size() > 0) {
708
+ std::cout << "ggml_webgpu: cpu detailed breakdown:\n";
709
+ }
710
+ for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) {
711
+ double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0;
712
+ std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
713
+ }
714
+ #endif
715
+
716
+ #ifdef GGML_WEBGPU_GPU_PROFILE
717
+ std::cout << "\n[ggml_webgpu gpu profiling summary]\n";
718
+ double total_gpu = 0.0;
719
+ for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
720
+ total_gpu += kv.second;
721
+ }
722
+ std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n";
723
+ std::cout << "\nggml_webgpu: gpu breakdown:\n";
724
+ for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
725
+ double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0;
726
+ std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
727
+ }
728
+ #endif
729
+
730
+ #if defined(GGML_WEBGPU_CPU_PROFILE) && defined(GGML_WEBGPU_GPU_PROFILE)
731
+ std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n";
732
+ #endif
733
+
734
+ #if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE)
437
735
  GGML_UNUSED(ctx);
736
+ #endif
438
737
  }
439
738
 
440
739
  static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) {
@@ -461,26 +760,27 @@ static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor
461
760
  ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
462
761
  }
463
762
 
464
- static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
763
+ // Used to determine if two tensors are the same for in-place operations
764
+ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) {
765
+ return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) &&
766
+ (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b));
767
+ }
768
+
769
+ static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
465
770
  uint32_t ne = (uint32_t) ggml_nelements(dst);
466
771
 
467
- std::vector<uint32_t> params = { ne,
468
- (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
469
- (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
470
- // Convert byte-strides to element-strides
471
- (uint32_t) (src->nb[0] / ggml_type_size(src->type)),
472
- (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
473
- (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
474
- (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
475
- (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
476
- (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
477
- (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
478
- (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
479
- // Logical shape — same for both tensors even if permuted
480
- (uint32_t) src->ne[0],
481
- (uint32_t) src->ne[1],
482
- (uint32_t) src->ne[2],
483
- (uint32_t) src->ne[3] };
772
+ std::vector<uint32_t> params = {
773
+ ne, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
774
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
775
+ // Convert byte-strides to element-strides
776
+ (uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
777
+ (uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
778
+ (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
779
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
780
+ // Logical shapes
781
+ (uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) dst->ne[0],
782
+ (uint32_t) dst->ne[1], (uint32_t) dst->ne[2]
783
+ };
484
784
 
485
785
  std::vector<wgpu::BindGroupEntry> entries = {
486
786
  { .binding = 0,
@@ -493,15 +793,18 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
493
793
  .size = ggml_webgpu_tensor_binding_size(ctx, dst) }
494
794
  };
495
795
 
496
- size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
796
+ size_t max_wg_size = ctx->max_wg_size_x;
497
797
  uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
498
- ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
798
+ return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x);
499
799
  }
500
800
 
501
- static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
801
+ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
802
+ ggml_tensor * src,
803
+ ggml_tensor * idx,
804
+ ggml_tensor * dst) {
502
805
  // For set rows specifically, we need to check if src and idx are empty tensors.
503
806
  if (ggml_is_empty(src) || ggml_is_empty(idx)) {
504
- return;
807
+ return std::nullopt;
505
808
  }
506
809
 
507
810
  webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs();
@@ -509,27 +812,21 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
509
812
  error_bufs.host_buf.Unmap();
510
813
  }
511
814
 
512
- std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
513
- (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
514
- (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
515
- // Convert byte-strides to element-strides
516
- (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
517
- (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
518
- (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
519
- (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
520
- (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)),
521
- (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
522
- (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
523
- (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
524
- (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
525
- // Shape of src
526
- (uint32_t) src->ne[0],
527
- (uint32_t) src->ne[1],
528
- (uint32_t) src->ne[2],
529
- (uint32_t) src->ne[3],
530
- // Shape of idx
531
- (uint32_t) (idx->ne[1]),
532
- (uint32_t) (idx->ne[2]) };
815
+ std::vector<uint32_t> params = {
816
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
817
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
818
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
819
+ // Convert byte-strides to element-strides
820
+ (uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
821
+ (uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
822
+ (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
823
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
824
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
825
+ // Shape of src
826
+ (uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3],
827
+ // Shape of idx
828
+ (uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
829
+ };
533
830
 
534
831
  std::vector<wgpu::BindGroupEntry> entries = {
535
832
  { .binding = 0,
@@ -547,22 +844,77 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
547
844
  { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() }
548
845
  };
549
846
 
550
- size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
551
- uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
847
+ size_t max_wg_size = ctx->max_wg_size_x;
552
848
 
553
- std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
554
- ctx->staged_set_row_error_bufs.push_back(error_bufs);
849
+ int vectorized = src->ne[0] % 4 == 0;
850
+ webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized];
851
+ uint32_t threads;
852
+ if (vectorized) {
853
+ threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
854
+ } else {
855
+ threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
856
+ }
857
+
858
+ uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size;
555
859
 
556
- ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
860
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1, error_bufs);
557
861
  }
558
862
 
559
- static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
863
+ static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx,
864
+ ggml_tensor * src,
865
+ ggml_tensor * idx,
866
+ ggml_tensor * dst) {
867
+ std::vector<uint32_t> params = {
868
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
869
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
870
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
871
+ // Convert byte-strides to element-strides
872
+ (uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
873
+ (uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
874
+ (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
875
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
876
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
877
+ // Shape of dst
878
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3],
879
+ // Shape of idx
880
+ (uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
881
+ };
882
+
883
+ std::vector<wgpu::BindGroupEntry> entries = {
884
+ { .binding = 0,
885
+ .buffer = ggml_webgpu_tensor_buf(src),
886
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src),
887
+ .size = ggml_webgpu_tensor_binding_size(ctx, src) },
888
+ { .binding = 1,
889
+ .buffer = ggml_webgpu_tensor_buf(idx),
890
+ .offset = ggml_webgpu_tensor_align_offset(ctx, idx),
891
+ .size = ggml_webgpu_tensor_binding_size(ctx, idx) },
892
+ { .binding = 2,
893
+ .buffer = ggml_webgpu_tensor_buf(dst),
894
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
895
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) }
896
+ };
897
+
898
+ size_t max_wg_size = ctx->max_wg_size_x;
899
+ uint32_t wg_x = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size;
900
+
901
+ webgpu_pipeline pipeline = ctx->get_rows_pipeline[src->type];
902
+ if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) {
903
+ pipeline = ctx->get_rows_f32_no_vec_pipeline;
904
+ }
905
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
906
+ }
907
+
908
+ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
909
+ ggml_tensor * src0,
910
+ ggml_tensor * src1,
911
+ ggml_tensor * dst) {
560
912
  std::vector<uint32_t> params = {
561
913
  (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
562
914
  (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
563
915
  (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
564
- (uint32_t) dst->ne[1], // number of rows in result (M)
565
- (uint32_t) dst->ne[0], // number of columns in result (N)
916
+ (uint32_t) dst->ne[0], // number of rows in result (M, transposed)
917
+ (uint32_t) dst->ne[1], // number of columns in result (N)
566
918
  (uint32_t) src0->ne[0], // number of columns in src0/src1 (K)
567
919
  (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1
568
920
  (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1
@@ -591,46 +943,463 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
591
943
  .size = ggml_webgpu_tensor_binding_size(ctx, dst) },
592
944
  };
593
945
 
946
+ webgpu_pipeline pipeline = ctx->mul_mat_pipeline[src0->type][src1->type];
947
+
594
948
  uint32_t wg_x =
595
949
  (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
596
- ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
950
+ uint32_t wg_y = 1;
951
+
952
+ bool use_fast = false;
953
+ switch (src1->type) {
954
+ case GGML_TYPE_F16:
955
+ use_fast = (src0->type == GGML_TYPE_F16);
956
+ break;
957
+ case GGML_TYPE_F32:
958
+ switch (src0->type) {
959
+ case GGML_TYPE_F32:
960
+ case GGML_TYPE_F16:
961
+ case GGML_TYPE_Q4_0:
962
+ use_fast = true;
963
+ break;
964
+ default:
965
+ break;
966
+ }
967
+ break;
968
+ default:
969
+ break;
970
+ }
971
+
972
+ if (use_fast) {
973
+ int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
974
+ if (dst->ne[1] == 1) {
975
+ // We don't support vectorized mul_mat_vec for quantized types
976
+ vectorized = vectorized && (src0->type < 2);
977
+ pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
978
+ uint32_t batches = dst->ne[2] * dst->ne[3];
979
+ uint32_t output_groups =
980
+ (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
981
+ uint32_t total_wg = output_groups * batches;
982
+ wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
983
+ wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
984
+ ctx->limits.maxComputeWorkgroupsPerDimension;
985
+ } else {
986
+ pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
987
+ uint32_t wg_m;
988
+ uint32_t wg_n;
989
+ if (ctx->supports_subgroup_matrix) {
990
+ // The total number of subgroups/workgroups needed per matrix.
991
+ uint32_t wg_m_sg_tile =
992
+ WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->subgroup_matrix_config.M;
993
+ wg_m = (dst->ne[0] + wg_m_sg_tile - 1) / wg_m_sg_tile;
994
+ uint32_t wg_n_sg_tile =
995
+ WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
996
+ wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
997
+ } else {
998
+ uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
999
+ uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
1000
+ wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
1001
+ wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
1002
+ }
1003
+ wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
1004
+ }
1005
+ }
1006
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
1007
+ }
1008
+
1009
+ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
1010
+ ggml_tensor * src0,
1011
+ ggml_tensor * src1,
1012
+ ggml_tensor * dst,
1013
+ webgpu_pipeline & pipeline,
1014
+ bool inplace) {
1015
+ std::vector<uint32_t> params = {
1016
+ (uint32_t) ggml_nelements(dst),
1017
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
1018
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
1019
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1020
+ (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
1021
+ (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
1022
+ (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
1023
+ (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
1024
+ (uint32_t) src0->ne[0],
1025
+ (uint32_t) src0->ne[1],
1026
+ (uint32_t) src0->ne[2],
1027
+ (uint32_t) src1->ne[0],
1028
+ (uint32_t) src1->ne[1],
1029
+ (uint32_t) src1->ne[2],
1030
+ (uint32_t) src1->ne[3],
1031
+ };
1032
+
1033
+ std::vector<wgpu::BindGroupEntry> entries = {
1034
+ { .binding = 0,
1035
+ .buffer = ggml_webgpu_tensor_buf(src0),
1036
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src0),
1037
+ .size = ggml_webgpu_tensor_binding_size(ctx, src0) },
1038
+ { .binding = 1,
1039
+ .buffer = ggml_webgpu_tensor_buf(src1),
1040
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src1),
1041
+ .size = ggml_webgpu_tensor_binding_size(ctx, src1) }
1042
+ };
1043
+ if (!inplace) {
1044
+ entries.push_back({ .binding = 2,
1045
+ .buffer = ggml_webgpu_tensor_buf(dst),
1046
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1047
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1048
+ }
1049
+
1050
+ size_t max_wg_size = ctx->max_wg_size_x;
1051
+ uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
1052
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
1053
+ }
1054
+
1055
+ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
1056
+ int inplace = ggml_webgpu_tensor_equal(src, dst);
1057
+
1058
+ std::vector<uint32_t> params = {
1059
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
1060
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1061
+ (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
1062
+ (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
1063
+ (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
1064
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
1065
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
1066
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
1067
+ (uint32_t) src->ne[0],
1068
+ (uint32_t) src->ne[1],
1069
+ (uint32_t) src->ne[2],
1070
+ (uint32_t) src->ne[3],
1071
+ *(uint32_t *) dst->op_params // epsilon, treated as f32 in the shader
1072
+ };
1073
+
1074
+ std::vector<wgpu::BindGroupEntry> entries = {
1075
+ { .binding = 0,
1076
+ .buffer = ggml_webgpu_tensor_buf(src),
1077
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src),
1078
+ .size = ggml_webgpu_tensor_binding_size(ctx, src) }
1079
+ };
1080
+ if (!inplace) {
1081
+ entries.push_back({ .binding = 1,
1082
+ .buffer = ggml_webgpu_tensor_buf(dst),
1083
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1084
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1085
+ }
1086
+
1087
+ return ggml_backend_webgpu_build(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src));
1088
+ }
1089
+
1090
+ static webgpu_command ggml_webgpu_rope(webgpu_context & ctx,
1091
+ ggml_tensor * src0,
1092
+ ggml_tensor * src1,
1093
+ ggml_tensor * src2,
1094
+ ggml_tensor * dst) {
1095
+ const int inplace = ggml_webgpu_tensor_equal(src0, dst);
1096
+ const int has_freq_factor = (src2 != nullptr);
1097
+
1098
+ const int n_dims = ((int32_t *) dst->op_params)[1];
1099
+ const int mode = ((int32_t *) dst->op_params)[2];
1100
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1101
+
1102
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1103
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1104
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
1105
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
1106
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
1107
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1108
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1109
+
1110
+ int sections[4];
1111
+ memcpy(sections, (int32_t *) dst->op_params + 11, 4 * sizeof(int));
1112
+
1113
+ float theta_scale = powf(freq_base, -2.0f / n_dims);
1114
+
1115
+ float corr_dims[2];
1116
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
1117
+
1118
+ std::vector<uint32_t> params = {
1119
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
1120
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
1121
+ src2 != nullptr ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)) : 0,
1122
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1123
+ (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
1124
+ (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
1125
+ (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
1126
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
1127
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
1128
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
1129
+ (uint32_t) ggml_nelements(src0) / 2,
1130
+ (uint32_t) src0->ne[0],
1131
+ (uint32_t) src0->ne[1],
1132
+ (uint32_t) src0->ne[2],
1133
+ (uint32_t) n_dims,
1134
+ (uint32_t) mode,
1135
+ *(uint32_t *) &theta_scale,
1136
+ *(uint32_t *) &attn_factor,
1137
+ *(uint32_t *) &freq_scale,
1138
+ *(uint32_t *) &ext_factor,
1139
+ *(uint32_t *) &corr_dims[0],
1140
+ *(uint32_t *) &corr_dims[1],
1141
+ (uint32_t) sections[0],
1142
+ (uint32_t) sections[1],
1143
+ (uint32_t) sections[2],
1144
+ (uint32_t) sections[3]
1145
+ };
1146
+
1147
+ std::vector<wgpu::BindGroupEntry> entries = {
1148
+ { .binding = 0,
1149
+ .buffer = ggml_webgpu_tensor_buf(src0),
1150
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src0),
1151
+ .size = ggml_webgpu_tensor_binding_size(ctx, src0) },
1152
+ { .binding = 1,
1153
+ .buffer = ggml_webgpu_tensor_buf(src1),
1154
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src1),
1155
+ .size = ggml_webgpu_tensor_binding_size(ctx, src1) }
1156
+ };
1157
+ uint32_t dst_binding = 2;
1158
+ if (has_freq_factor) {
1159
+ dst_binding = 3;
1160
+ entries.push_back({ .binding = 2,
1161
+ .buffer = ggml_webgpu_tensor_buf(src2),
1162
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src2),
1163
+ .size = ggml_webgpu_tensor_binding_size(ctx, src2) });
1164
+ }
1165
+ if (!inplace) {
1166
+ entries.push_back({ .binding = dst_binding,
1167
+ .buffer = ggml_webgpu_tensor_buf(dst),
1168
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1169
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1170
+ }
1171
+
1172
+ webgpu_pipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace];
1173
+ size_t max_wg_size = ctx->max_wg_size_x;
1174
+ uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size;
1175
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
1176
+ }
1177
+
1178
+ static webgpu_command ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
1179
+ const int split = (src1 != nullptr);
1180
+
1181
+ std::vector<uint32_t> params = {
1182
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
1183
+ src1 != nullptr ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)) : 0,
1184
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1185
+ (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
1186
+ (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
1187
+ (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
1188
+ src1 != nullptr ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) :
1189
+ (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
1190
+ src1 != nullptr ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) :
1191
+ (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
1192
+ src1 != nullptr ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) :
1193
+ (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
1194
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
1195
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
1196
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
1197
+ (uint32_t) ggml_nelements(dst),
1198
+ (uint32_t) dst->ne[0],
1199
+ (uint32_t) dst->ne[1],
1200
+ (uint32_t) dst->ne[2],
1201
+ (uint32_t) ((int32_t *) dst->op_params)[1], // swapped
1202
+ *(uint32_t *) &dst->op_params[2], // alpha, for swiglu_oai
1203
+ *(uint32_t *) &dst->op_params[3], // limit, for swiglu_oai
1204
+ };
1205
+
1206
+ std::vector<wgpu::BindGroupEntry> entries = {
1207
+ { .binding = 0,
1208
+ .buffer = ggml_webgpu_tensor_buf(src0),
1209
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src0),
1210
+ .size = ggml_webgpu_tensor_binding_size(ctx, src0) },
1211
+ };
1212
+ uint32_t dst_binding = 1;
1213
+ if (split) {
1214
+ dst_binding = 2;
1215
+ entries.push_back({ .binding = 1,
1216
+ .buffer = ggml_webgpu_tensor_buf(src1),
1217
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src1),
1218
+ .size = ggml_webgpu_tensor_binding_size(ctx, src1) });
1219
+ }
1220
+ entries.push_back({ .binding = dst_binding,
1221
+ .buffer = ggml_webgpu_tensor_buf(dst),
1222
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1223
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1224
+
1225
+ webgpu_pipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split];
1226
+ size_t max_wg_size = ctx->max_wg_size_x;
1227
+ uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
1228
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
1229
+ }
1230
+
1231
+ static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
1232
+ int inplace = ggml_webgpu_tensor_equal(src, dst);
1233
+
1234
+ std::vector<uint32_t> params = {
1235
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
1236
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1237
+ (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
1238
+ (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
1239
+ (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
1240
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
1241
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
1242
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
1243
+ (uint32_t) ggml_nelements(dst),
1244
+ (uint32_t) src->ne[0],
1245
+ (uint32_t) src->ne[1],
1246
+ (uint32_t) src->ne[2],
1247
+ *(uint32_t *) dst->op_params, // scale
1248
+ *(uint32_t *) &dst->op_params[1] // bias
1249
+ };
1250
+
1251
+ std::vector<wgpu::BindGroupEntry> entries = {
1252
+ { .binding = 0,
1253
+ .buffer = ggml_webgpu_tensor_buf(src),
1254
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src),
1255
+ .size = ggml_webgpu_tensor_binding_size(ctx, src) }
1256
+ };
1257
+ if (!inplace) {
1258
+ entries.push_back({ .binding = 1,
1259
+ .buffer = ggml_webgpu_tensor_buf(dst),
1260
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1261
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1262
+ }
1263
+
1264
+ size_t max_wg_size = ctx->max_wg_size_x;
1265
+ uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
1266
+ return ggml_backend_webgpu_build(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x);
1267
+ }
1268
+
1269
+ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx,
1270
+ ggml_tensor * src0,
1271
+ ggml_tensor * src1,
1272
+ ggml_tensor * src2,
1273
+ ggml_tensor * dst) {
1274
+ const int inplace = ggml_webgpu_tensor_equal(src0, dst);
1275
+ const int mask_type = (src1 != nullptr) ? src1->type : 2; // use 2 for no mask here
1276
+ const int has_sink = (src2 != nullptr);
1277
+ float max_bias;
1278
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
1279
+ float n_head_log2 = float(1u << (uint32_t) floor(log2(src0->ne[2])));
1280
+ float m0 = powf(2.0f, -(max_bias) / n_head_log2);
1281
+ float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1282
+
1283
+ std::vector<uint32_t> params = {
1284
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
1285
+ mask_type < 2 ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)) : 0,
1286
+ has_sink ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)) : 0,
1287
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
1288
+ (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
1289
+ (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
1290
+ (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
1291
+ mask_type < 2 ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0,
1292
+ mask_type < 2 ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) : 0,
1293
+ mask_type < 2 ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) : 0,
1294
+ (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
1295
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
1296
+ (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
1297
+ (uint32_t) ggml_nelements(dst),
1298
+ (uint32_t) src0->ne[0],
1299
+ (uint32_t) src0->ne[1],
1300
+ (uint32_t) src0->ne[2],
1301
+ mask_type < 2 ? (uint32_t) src1->ne[2] : 0,
1302
+ mask_type < 2 ? (uint32_t) src1->ne[3] : 0,
1303
+ *(uint32_t *) dst->op_params, // scale
1304
+ *(uint32_t *) &max_bias,
1305
+ *(uint32_t *) &n_head_log2,
1306
+ *(uint32_t *) &m0,
1307
+ *(uint32_t *) &m1
1308
+ };
1309
+
1310
+ std::vector<wgpu::BindGroupEntry> entries = {
1311
+ { .binding = 0,
1312
+ .buffer = ggml_webgpu_tensor_buf(src0),
1313
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src0),
1314
+ .size = ggml_webgpu_tensor_binding_size(ctx, src0) }
1315
+ };
1316
+ uint32_t binding_num = 1;
1317
+ if (mask_type < 2) {
1318
+ entries.push_back({ .binding = binding_num,
1319
+ .buffer = ggml_webgpu_tensor_buf(src1),
1320
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src1),
1321
+ .size = ggml_webgpu_tensor_binding_size(ctx, src1) });
1322
+ binding_num++;
1323
+ }
1324
+ if (has_sink) {
1325
+ entries.push_back({ .binding = binding_num,
1326
+ .buffer = ggml_webgpu_tensor_buf(src2),
1327
+ .offset = ggml_webgpu_tensor_align_offset(ctx, src2),
1328
+ .size = ggml_webgpu_tensor_binding_size(ctx, src2) });
1329
+ binding_num++;
1330
+ }
1331
+ if (!inplace) {
1332
+ entries.push_back({ .binding = binding_num,
1333
+ .buffer = ggml_webgpu_tensor_buf(dst),
1334
+ .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
1335
+ .size = ggml_webgpu_tensor_binding_size(ctx, dst) });
1336
+ }
1337
+
1338
+ return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries,
1339
+ ggml_nrows(dst));
597
1340
  }
598
1341
 
599
- // Returns true if node has enqueued work into the queue, false otherwise
600
- static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
1342
+ // Returns the encoded command, or std::nullopt if the operation is a no-op
1343
+ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
601
1344
  if (ggml_is_empty(node)) {
602
- return false;
1345
+ return std::nullopt;
603
1346
  }
604
1347
  WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
605
1348
 
606
1349
  ggml_tensor * src0 = node->src[0];
607
1350
  ggml_tensor * src1 = node->src[1];
1351
+ ggml_tensor * src2 = node->src[2];
608
1352
 
609
1353
  switch (node->op) {
610
1354
  // no-ops
611
1355
  case GGML_OP_NONE:
612
1356
  case GGML_OP_VIEW:
613
1357
  case GGML_OP_PERMUTE:
614
- return false;
1358
+ case GGML_OP_TRANSPOSE:
1359
+ case GGML_OP_RESHAPE:
1360
+ return std::nullopt;
615
1361
  case GGML_OP_CPY:
1362
+ case GGML_OP_CONT:
1363
+ return ggml_webgpu_cpy(ctx, src0, node);
1364
+ case GGML_OP_SET_ROWS:
1365
+ return ggml_webgpu_set_rows(ctx, src0, src1, node);
1366
+ case GGML_OP_GET_ROWS:
1367
+ return ggml_webgpu_get_rows(ctx, src0, src1, node);
1368
+ case GGML_OP_MUL_MAT:
1369
+ return ggml_webgpu_mul_mat(ctx, src0, src1, node);
1370
+ case GGML_OP_ADD:
616
1371
  {
617
- ggml_webgpu_cpy(ctx, src0, node);
618
- break;
1372
+ int inplace = ggml_webgpu_tensor_equal(src0, node);
1373
+ return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace);
619
1374
  }
620
- case GGML_OP_SET_ROWS:
1375
+ case GGML_OP_SUB:
621
1376
  {
622
- ggml_webgpu_set_rows(ctx, src0, src1, node);
623
- break;
1377
+ int inplace = ggml_webgpu_tensor_equal(src0, node);
1378
+ return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace);
624
1379
  }
625
- case GGML_OP_MUL_MAT:
1380
+ case GGML_OP_MUL:
626
1381
  {
627
- ggml_webgpu_mul_mat(ctx, src0, src1, node);
628
- break;
1382
+ int inplace = ggml_webgpu_tensor_equal(src0, node);
1383
+ return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace);
1384
+ }
1385
+ case GGML_OP_DIV:
1386
+ {
1387
+ int inplace = ggml_webgpu_tensor_equal(src0, node);
1388
+ return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace);
629
1389
  }
1390
+ case GGML_OP_RMS_NORM:
1391
+ return ggml_webgpu_rms_norm(ctx, src0, node);
1392
+ case GGML_OP_ROPE:
1393
+ return ggml_webgpu_rope(ctx, src0, src1, src2, node);
1394
+ case GGML_OP_GLU:
1395
+ return ggml_webgpu_glu(ctx, src0, src1, node);
1396
+ case GGML_OP_SCALE:
1397
+ return ggml_webgpu_scale(ctx, src0, node);
1398
+ case GGML_OP_SOFT_MAX:
1399
+ return ggml_webgpu_soft_max(ctx, src0, src1, src2, node);
630
1400
  default:
631
- return false;
1401
+ return std::nullopt;
632
1402
  }
633
- return true;
634
1403
  }
635
1404
 
636
1405
  static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
@@ -639,13 +1408,35 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
639
1408
  ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
640
1409
  webgpu_context ctx = backend_ctx->webgpu_ctx;
641
1410
 
642
- for (int i = 0; i < cgraph->n_nodes; i++) {
643
- ggml_webgpu_encode_node(ctx, cgraph->nodes[i]);
644
- }
1411
+ WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute);
645
1412
 
646
- ggml_backend_webgpu_submit_queue(ctx);
647
- ggml_backend_webgpu_wait_on_submission(ctx);
1413
+ ctx->inflight_threads++;
648
1414
 
1415
+ std::vector<webgpu_command> commands;
1416
+ std::vector<webgpu_submission_futures> futures;
1417
+ for (int i = 0; i < cgraph->n_nodes; i++) {
1418
+ if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
1419
+ commands.push_back(*cmd);
1420
+ }
1421
+ // compute the batch size based on the number of inflight threads
1422
+ uint inflight_threads = ctx->inflight_threads;
1423
+ uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
1424
+ WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
1425
+ if (commands.size() >= batch_size) {
1426
+ futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
1427
+ // Process events and check for completed submissions
1428
+ ctx->instance.ProcessEvents();
1429
+ ggml_backend_webgpu_wait(ctx, futures, false);
1430
+ commands.clear();
1431
+ }
1432
+ }
1433
+ if (!commands.empty()) {
1434
+ webgpu_submission_futures new_futures = ggml_backend_webgpu_submit(ctx, commands);
1435
+ futures.push_back(new_futures);
1436
+ }
1437
+ ggml_backend_webgpu_wait(ctx, futures);
1438
+ ctx->inflight_threads--;
1439
+ WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx);
649
1440
  return GGML_STATUS_SUCCESS;
650
1441
  }
651
1442
 
@@ -663,6 +1454,7 @@ static ggml_backend_i ggml_backend_webgpu_i = {
663
1454
  /* .graph_compute = */ ggml_backend_webgpu_graph_compute,
664
1455
  /* .event_record = */ NULL,
665
1456
  /* .event_wait = */ NULL,
1457
+ /* .graph_optimize = */ NULL,
666
1458
  };
667
1459
 
668
1460
  /* End GGML Backend Interface */
@@ -670,7 +1462,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
670
1462
  /* GGML Backend Buffer Interface */
671
1463
 
672
1464
  static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
673
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()");
674
1465
  ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
675
1466
  ctx->buffer.Destroy();
676
1467
  }
@@ -691,16 +1482,19 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe
691
1482
  return;
692
1483
  }
693
1484
 
694
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", "
695
- << offset << ", " << size << ")");
1485
+ WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor);
696
1486
 
697
1487
  ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
698
1488
 
1489
+ WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buf_ctx->label << ", " << tensor << ", " << value
1490
+ << ", " << offset << ", " << size << ")");
1491
+
699
1492
  size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
700
1493
 
701
1494
  // This is a trick to set all bytes of a u32 to the same 1 byte value.
702
1495
  uint32_t val32 = (uint32_t) value * 0x01010101;
703
1496
  ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size);
1497
+ WEBGPU_CPU_PROFILE_TOTAL_END(memset_tensor, buf_ctx->webgpu_ctx);
704
1498
  }
705
1499
 
706
1500
  static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
@@ -708,11 +1502,13 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
708
1502
  const void * data,
709
1503
  size_t offset,
710
1504
  size_t size) {
711
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", "
712
- << offset << ", " << size << ")");
1505
+ WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor);
713
1506
  ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
714
1507
  webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
715
1508
 
1509
+ WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buf_ctx->label << ", " << tensor << ", " << data
1510
+ << ", " << offset << ", " << size << ")");
1511
+
716
1512
  size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
717
1513
 
718
1514
  webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
@@ -728,12 +1524,21 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
728
1524
  ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
729
1525
  }
730
1526
  // memset the remaining bytes
731
- ggml_backend_webgpu_buffer_memset(
732
- webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
1527
+ ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size),
1528
+ remaining_size);
733
1529
  } else {
734
1530
  // wait for WriteBuffer to complete
735
- ggml_backend_webgpu_wait_on_submission(webgpu_ctx);
1531
+ webgpu_ctx->instance.WaitAny(
1532
+ webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
1533
+ [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
1534
+ if (status != wgpu::QueueWorkDoneStatus::Success) {
1535
+ GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
1536
+ std::string(message).c_str());
1537
+ }
1538
+ }),
1539
+ UINT64_MAX);
736
1540
  }
1541
+ WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx);
737
1542
  }
738
1543
 
739
1544
  static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
@@ -741,12 +1546,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
741
1546
  void * data,
742
1547
  size_t offset,
743
1548
  size_t size) {
744
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", "
745
- << offset << ", " << size << ")");
746
-
747
- ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
748
- webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
749
- wgpu::Device device = webgpu_ctx->device;
1549
+ WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor);
1550
+ ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
1551
+ WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buf_ctx->label << ", " << tensor << ", " << data
1552
+ << ", " << offset << ", " << size << ")");
1553
+ webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
1554
+ wgpu::Device device = webgpu_ctx->device;
750
1555
 
751
1556
  size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
752
1557
 
@@ -763,11 +1568,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
763
1568
  if (webgpu_ctx->get_tensor_staging_buf) {
764
1569
  webgpu_ctx->get_tensor_staging_buf.Destroy();
765
1570
  }
766
- ggml_webgpu_create_buffer(device,
767
- webgpu_ctx->get_tensor_staging_buf,
768
- final_size,
769
- wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
770
- "get_tensor_staging_buf");
1571
+ ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
1572
+ wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
771
1573
  }
772
1574
 
773
1575
  // Copy the data from the buffer to the staging buffer
@@ -786,12 +1588,15 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
786
1588
  // Copy the data from the mapped range to the output buffer
787
1589
  std::memcpy(data, mapped_range, size);
788
1590
  webgpu_ctx->get_tensor_staging_buf.Unmap();
1591
+ WEBGPU_CPU_PROFILE_TOTAL_END(get_tensor, webgpu_ctx);
789
1592
  }
790
1593
 
791
1594
  static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
792
1595
  WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")");
1596
+ WEBGPU_CPU_PROFILE_TOTAL_START(clear);
793
1597
  ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
794
1598
  ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size);
1599
+ WEBGPU_CPU_PROFILE_TOTAL_END(clear, buf_ctx->webgpu_ctx);
795
1600
  }
796
1601
 
797
1602
  static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = {
@@ -817,17 +1622,20 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer
817
1622
 
818
1623
  static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
819
1624
  size_t size) {
820
- WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")");
1625
+ static std::atomic<int> buffer_count;
1626
+ int buffer_id = buffer_count++;
1627
+ std::string buf_name = "tensor_buf" + std::to_string(buffer_id);
1628
+ WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer_" << buffer_id << ": " << size << " bytes");
821
1629
  ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
822
1630
 
823
1631
  wgpu::Buffer buf;
824
- ggml_webgpu_create_buffer(ctx->webgpu_ctx->device,
825
- buf,
1632
+ ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf,
826
1633
  (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1),
827
1634
  wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst,
828
- "allocated_buffer");
1635
+ buf_name.c_str());
829
1636
 
830
- ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
1637
+ ggml_backend_webgpu_buffer_context * buf_ctx =
1638
+ new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf, buf_name);
831
1639
 
832
1640
  return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size);
833
1641
  }
@@ -887,9 +1695,17 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
887
1695
  return reinterpret_cast<ggml_guid_t>((void *) guid_str);
888
1696
  }
889
1697
 
1698
+ // Workgroup size is a common constant
1699
+ static std::vector<wgpu::ConstantEntry> ggml_webgpu_wg_size_entry(uint32_t wg_size) {
1700
+ std::vector<wgpu::ConstantEntry> constants(1);
1701
+ constants[0].key = "wg_size";
1702
+ constants[0].value = wg_size;
1703
+ return constants;
1704
+ }
1705
+
890
1706
  static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
891
1707
  // we use the maximum workgroup size for the memset pipeline
892
- size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
1708
+ size_t max_wg_size = webgpu_ctx->max_wg_size_x;
893
1709
  size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension;
894
1710
  // Size the bytes_per_thread so that the largest buffer size can be handled
895
1711
  webgpu_ctx->memset_bytes_per_thread =
@@ -903,109 +1719,411 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
903
1719
  }
904
1720
 
905
1721
  static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
906
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
907
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
908
- wgsl_mul_mat_f32_f32,
909
- "mul_mat_f32_f32");
910
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
911
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
912
- wgsl_mul_mat_f16_f16,
913
- "mul_mat_f16_f16");
914
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
915
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
916
- wgsl_mul_mat_f16_f32,
917
- "mul_mat_f16_f32");
918
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
919
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
920
- wgsl_mul_mat_q4_0_f32,
921
- "mul_mat_q4_0_f32");
922
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
923
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
924
- wgsl_mul_mat_q4_1_f32,
925
- "mul_mat_q4_1_f32");
926
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
927
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
928
- wgsl_mul_mat_q5_0_f32,
929
- "mul_mat_q5_0_f32");
930
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
931
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
932
- wgsl_mul_mat_q5_1_f32,
933
- "mul_mat_q5_1_f32");
934
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
935
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
936
- wgsl_mul_mat_q8_0_f32,
937
- "mul_mat_q8_0_f32");
938
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
939
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
940
- wgsl_mul_mat_q2_k_f32,
941
- "mul_mat_q2_k_f32");
942
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
943
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
944
- wgsl_mul_mat_q3_k_f32,
945
- "mul_mat_q3_k_f32");
946
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
947
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
948
- wgsl_mul_mat_q4_k_f32,
949
- "mul_mat_q4_k_f32");
950
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
951
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
952
- wgsl_mul_mat_q5_k_f32,
953
- "mul_mat_q5_k_f32");
954
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
955
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
956
- wgsl_mul_mat_q6_k_f32,
957
- "mul_mat_q6_k_f32");
958
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
959
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
960
- wgsl_mul_mat_iq2_xxs_f32,
961
- "mul_mat_iq2_xxs_f32");
962
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
963
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
964
- wgsl_mul_mat_iq2_xs_f32,
965
- "mul_mat_iq2_xs_f32");
966
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
967
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
968
- wgsl_mul_mat_iq2_s_f32,
969
- "mul_mat_iq2_s_f32");
970
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
971
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
972
- wgsl_mul_mat_iq3_xxs_f32,
973
- "mul_mat_iq3_xxs_f32");
974
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
975
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
976
- wgsl_mul_mat_iq3_s_f32,
977
- "mul_mat_iq3_s_f32");
978
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
979
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
980
- wgsl_mul_mat_iq1_s_f32,
981
- "mul_mat_iq1_s_f32");
982
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
983
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
984
- wgsl_mul_mat_iq1_m_f32,
985
- "mul_mat_iq1_m_f32");
986
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
987
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
988
- wgsl_mul_mat_iq4_nl_f32,
989
- "mul_mat_iq4_nl_f32");
990
- ggml_webgpu_create_pipeline(webgpu_ctx->device,
991
- webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
992
- wgsl_mul_mat_iq4_xs_f32,
993
- "mul_mat_iq4_xs_f32");
1722
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
1723
+ wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32");
1724
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
1725
+ wgsl_mul_mat_q4_1_f32, "mul_mat_q4_1_f32");
1726
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
1727
+ wgsl_mul_mat_q5_0_f32, "mul_mat_q5_0_f32");
1728
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
1729
+ wgsl_mul_mat_q5_1_f32, "mul_mat_q5_1_f32");
1730
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
1731
+ wgsl_mul_mat_q8_0_f32, "mul_mat_q8_0_f32");
1732
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
1733
+ wgsl_mul_mat_q2_k_f32, "mul_mat_q2_k_f32");
1734
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
1735
+ wgsl_mul_mat_q3_k_f32, "mul_mat_q3_k_f32");
1736
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
1737
+ wgsl_mul_mat_q4_k_f32, "mul_mat_q4_k_f32");
1738
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
1739
+ wgsl_mul_mat_q5_k_f32, "mul_mat_q5_k_f32");
1740
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
1741
+ wgsl_mul_mat_q6_k_f32, "mul_mat_q6_k_f32");
1742
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
1743
+ wgsl_mul_mat_iq2_xxs_f32, "mul_mat_iq2_xxs_f32");
1744
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
1745
+ wgsl_mul_mat_iq2_xs_f32, "mul_mat_iq2_xs_f32");
1746
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
1747
+ wgsl_mul_mat_iq2_s_f32, "mul_mat_iq2_s_f32");
1748
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
1749
+ wgsl_mul_mat_iq3_xxs_f32, "mul_mat_iq3_xxs_f32");
1750
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
1751
+ wgsl_mul_mat_iq3_s_f32, "mul_mat_iq3_s_f32");
1752
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
1753
+ wgsl_mul_mat_iq1_s_f32, "mul_mat_iq1_s_f32");
1754
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
1755
+ wgsl_mul_mat_iq1_m_f32, "mul_mat_iq1_m_f32");
1756
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
1757
+ wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32");
1758
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
1759
+ wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
1760
+
1761
+ if (webgpu_ctx->supports_subgroup_matrix) {
1762
+ std::map<std::string, std::string> sg_matrix_repls;
1763
+ sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
1764
+ sg_matrix_repls["WEBGPU_TILE_K"] = std::to_string(WEBGPU_MUL_MAT_TILE_K);
1765
+ sg_matrix_repls["WEBGPU_SUBGROUP_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M);
1766
+ sg_matrix_repls["WEBGPU_SUBGROUP_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N);
1767
+ sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M);
1768
+ sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N);
1769
+ sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.M);
1770
+ sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
1771
+ sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
1772
+
1773
+ std::string proc_mul_mat_subgroup_matrix_f32_f32 =
1774
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
1775
+ std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
1776
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
1777
+ std::string proc_mul_mat_subgroup_matrix_f16_f32 =
1778
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
1779
+ std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
1780
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
1781
+ std::string proc_mul_mat_subgroup_matrix_f16_f16 =
1782
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
1783
+ std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
1784
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
1785
+ std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
1786
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
1787
+ std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
1788
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
1789
+
1790
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1791
+ webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
1792
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
1793
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
1794
+ "mul_mat_subgroup_matrix_f32_f32_vec");
1795
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1796
+ webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
1797
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
1798
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
1799
+ "mul_mat_subgroup_matrix_f16_f32_vec");
1800
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
1801
+ webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
1802
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
1803
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
1804
+ "mul_mat_subgroup_matrix_f16_f16_vec");
1805
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1806
+ webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
1807
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
1808
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
1809
+ "mul_mat_subgroup_matrix_q4_0_f32_vec");
1810
+ } else {
1811
+ std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
1812
+ mul_mat_reg_tile_constants[0].key = "TILE_K";
1813
+ mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
1814
+ mul_mat_reg_tile_constants[1].key = "WORKGROUP_SIZE_M";
1815
+ mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
1816
+ mul_mat_reg_tile_constants[2].key = "WORKGROUP_SIZE_N";
1817
+ mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
1818
+
1819
+ std::map<std::string, std::string> reg_repls;
1820
+ reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
1821
+ reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
1822
+
1823
+ // Process each reg-tile shader with tile replacements.
1824
+ // Keep the processed strings in-scope so .c_str() remains valid.
1825
+ std::string proc_mul_mat_reg_tile_f32_f32 =
1826
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
1827
+ std::string proc_mul_mat_reg_tile_f32_f32_vec =
1828
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
1829
+ std::string proc_mul_mat_reg_tile_f16_f32 =
1830
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
1831
+ std::string proc_mul_mat_reg_tile_f16_f32_vec =
1832
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
1833
+ std::string proc_mul_mat_reg_tile_f16_f16 =
1834
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
1835
+ std::string proc_mul_mat_reg_tile_f16_f16_vec =
1836
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
1837
+ std::string proc_mul_mat_reg_tile_q4_0_f32 =
1838
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
1839
+ std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
1840
+ ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
1841
+
1842
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
1843
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
1844
+ "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
1845
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
1846
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
1847
+ "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
1848
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
1849
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
1850
+ "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
1851
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
1852
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
1853
+ "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
1854
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
1855
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
1856
+ "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
1857
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
1858
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
1859
+ "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
1860
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
1861
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
1862
+ "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
1863
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
1864
+ ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
1865
+ "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
1866
+ }
1867
+
1868
+ std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
1869
+ mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
1870
+ mul_mat_vec_constants[0].value = WEBGPU_MUL_MAT_VEC_WG_SIZE;
1871
+ mul_mat_vec_constants[1].key = "TILE_K";
1872
+ mul_mat_vec_constants[1].value = WEBGPU_MUL_MAT_VEC_TILE_K;
1873
+ mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG";
1874
+ mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
1875
+
1876
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1877
+ webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
1878
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
1879
+ webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
1880
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1881
+ webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
1882
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
1883
+ webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
1884
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
1885
+ webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
1886
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
1887
+ webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
1888
+ webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1889
+ webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
994
1890
  }
995
1891
 
996
1892
  static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
997
- std::vector<wgpu::ConstantEntry> constants(1);
998
- constants[0].key = "wg_size";
999
- constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
1000
- ggml_webgpu_create_pipeline(
1001
- webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
1893
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16,
1894
+ "set_rows_f16", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
1895
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16_vec,
1896
+ "set_rows_f16_vec", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
1897
+ }
1898
+
1899
+ static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
1900
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1901
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F32], wgsl_get_rows_f32_vec,
1902
+ "get_rows_f32_vec", constants);
1903
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_f32_no_vec_pipeline, wgsl_get_rows_f32,
1904
+ "get_rows_f32", constants);
1905
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_F16], wgsl_get_rows_f16,
1906
+ "get_rows_f16", constants);
1907
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_I32], wgsl_get_rows_i32,
1908
+ "get_rows_i32", constants);
1909
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_0], wgsl_get_rows_q4_0,
1910
+ "get_rows_q4_0", constants);
1911
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_1], wgsl_get_rows_q4_1,
1912
+ "get_rows_q4_1", constants);
1913
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_0], wgsl_get_rows_q5_0,
1914
+ "get_rows_q5_0", constants);
1915
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_1], wgsl_get_rows_q5_1,
1916
+ "get_rows_q5_1", constants);
1917
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q8_0], wgsl_get_rows_q8_0,
1918
+ "get_rows_q8_0", constants);
1919
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q2_K], wgsl_get_rows_q2_k,
1920
+ "get_rows_q2_k", constants);
1921
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q3_K], wgsl_get_rows_q3_k,
1922
+ "get_rows_q3_k", constants);
1923
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_K], wgsl_get_rows_q4_k,
1924
+ "get_rows_q4_k", constants);
1925
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_K], wgsl_get_rows_q5_k,
1926
+ "get_rows_q5_k", constants);
1927
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q6_K], wgsl_get_rows_q6_k,
1928
+ "get_rows_q6_k", constants);
1929
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XXS],
1930
+ wgsl_get_rows_iq2_xxs, "get_rows_iq2_xxs", constants);
1931
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XS],
1932
+ wgsl_get_rows_iq2_xs, "get_rows_iq2_xs", constants);
1933
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_S], wgsl_get_rows_iq2_s,
1934
+ "get_rows_iq2_s", constants);
1935
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_XXS],
1936
+ wgsl_get_rows_iq3_xxs, "get_rows_iq3_xxs", constants);
1937
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_S], wgsl_get_rows_iq3_s,
1938
+ "get_rows_iq3_s", constants);
1939
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_S], wgsl_get_rows_iq1_s,
1940
+ "get_rows_iq1_s", constants);
1941
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_M], wgsl_get_rows_iq1_m,
1942
+ "get_rows_iq1_m", constants);
1943
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_NL],
1944
+ wgsl_get_rows_iq4_nl, "get_rows_iq4_nl", constants);
1945
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_XS],
1946
+ wgsl_get_rows_iq4_xs, "get_rows_iq4_xs", constants);
1002
1947
  }
1003
1948
 
1004
1949
  static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
1005
- std::vector<wgpu::ConstantEntry> constants(1);
1006
- constants[0].key = "wg_size";
1007
- constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
1008
- ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants);
1950
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1951
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
1952
+ wgsl_cpy_f32_f32, "cpy_f32_f32", constants);
1953
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F16],
1954
+ wgsl_cpy_f32_f16, "cpy_f32_f16", constants);
1955
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
1956
+ wgsl_cpy_f16_f32, "cpy_f16_f32", constants);
1957
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
1958
+ wgsl_cpy_f16_f16, "cpy_f16_f16", constants);
1959
+ }
1960
+
1961
+ static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
1962
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1963
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32",
1964
+ constants);
1965
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16",
1966
+ constants);
1967
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace,
1968
+ "add_f32_inplace", constants);
1969
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace,
1970
+ "add_f16_inplace", constants);
1971
+ }
1972
+
1973
+ static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) {
1974
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1975
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32",
1976
+ constants);
1977
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16",
1978
+ constants);
1979
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace,
1980
+ "sub_f32_inplace", constants);
1981
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace,
1982
+ "sub_f16_inplace", constants);
1983
+ }
1984
+
1985
+ static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
1986
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1987
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32",
1988
+ constants);
1989
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16",
1990
+ constants);
1991
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace,
1992
+ "mul_f32_inplace", constants);
1993
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace,
1994
+ "mul_f16_inplace", constants);
1995
+ }
1996
+
1997
+ static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) {
1998
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
1999
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32",
2000
+ constants);
2001
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16",
2002
+ constants);
2003
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace,
2004
+ "div_f32_inplace", constants);
2005
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace,
2006
+ "div_f16_inplace", constants);
2007
+ }
2008
+
2009
+ static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
2010
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);
2011
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline[0], wgsl_rms_norm, "rms_norm",
2012
+ constants);
2013
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline[1], wgsl_rms_norm_inplace,
2014
+ "rms_norm_inplace", constants);
2015
+ }
2016
+
2017
+ static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
2018
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
2019
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][0], wgsl_rope_f32,
2020
+ "rope_f32", constants);
2021
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][1],
2022
+ wgsl_rope_f32_inplace, "rope_f32_inplace", constants);
2023
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][0], wgsl_rope_f32_ff,
2024
+ "rope_f32_ff", constants);
2025
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][1],
2026
+ wgsl_rope_f32_ff_inplace, "rope_f32_ff_inplace", constants);
2027
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][0], wgsl_rope_f16,
2028
+ "rope_f16", constants);
2029
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][1],
2030
+ wgsl_rope_f16_inplace, "rope_f16_inplace", constants);
2031
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][0], wgsl_rope_f16_ff,
2032
+ "rope_f16_ff", constants);
2033
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][1],
2034
+ wgsl_rope_f16_ff_inplace, "rope_f16_ff_inplace", constants);
2035
+ }
2036
+
2037
+ static void ggml_webgpu_init_glu_pipeline(webgpu_context & webgpu_ctx) {
2038
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
2039
+ // reglu
2040
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][0],
2041
+ wgsl_reglu_f32, "reglu_f32", constants);
2042
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][0],
2043
+ wgsl_reglu_f16, "reglu_f16", constants);
2044
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][1],
2045
+ wgsl_reglu_f32_split, "reglu_f32_split", constants);
2046
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][1],
2047
+ wgsl_reglu_f16_split, "reglu_f16_split", constants);
2048
+ // geglu
2049
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][0],
2050
+ wgsl_geglu_f32, "geglu_f32", constants);
2051
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][0],
2052
+ wgsl_geglu_f16, "geglu_f16", constants);
2053
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][1],
2054
+ wgsl_geglu_f32_split, "geglu_f32_split", constants);
2055
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][1],
2056
+ wgsl_geglu_f16_split, "geglu_f16_split", constants);
2057
+ // swiglu
2058
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][0],
2059
+ wgsl_swiglu_f32, "swiglu_f32", constants);
2060
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][0],
2061
+ wgsl_swiglu_f16, "swiglu_f16", constants);
2062
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][1],
2063
+ wgsl_swiglu_f32_split, "swiglu_f32_split", constants);
2064
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][1],
2065
+ wgsl_swiglu_f16_split, "swiglu_f16_split", constants);
2066
+ // swiglu_oai
2067
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][0],
2068
+ wgsl_swiglu_oai_f32, "swiglu_oai_f32", constants);
2069
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][1],
2070
+ wgsl_swiglu_oai_f32_split, "swiglu_oai_f32_split", constants);
2071
+ // geglu_erf
2072
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][0],
2073
+ wgsl_geglu_erf_f32, "geglu_erf_f32", constants);
2074
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][0],
2075
+ wgsl_geglu_erf_f16, "geglu_erf_f16", constants);
2076
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][1],
2077
+ wgsl_geglu_erf_f32_split, "geglu_erf_f32_split", constants);
2078
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][1],
2079
+ wgsl_geglu_erf_f16_split, "geglu_erf_f16_split", constants);
2080
+ // geglu_quick
2081
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][0],
2082
+ wgsl_geglu_quick_f32, "geglu_quick_f32", constants);
2083
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][0],
2084
+ wgsl_geglu_quick_f16, "geglu_quick_f16", constants);
2085
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][1],
2086
+ wgsl_geglu_quick_f32_split, "geglu_quick_f32_split", constants);
2087
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][1],
2088
+ wgsl_geglu_quick_f16_split, "geglu_quick_f16_split", constants);
2089
+ }
2090
+
2091
+ static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
2092
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);
2093
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->scale_pipeline[0], wgsl_scale_f32, "scale_f32",
2094
+ constants);
2095
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->scale_pipeline[1], wgsl_scale_f32_inplace,
2096
+ "scale_f32_inplace", constants);
2097
+ }
2098
+
2099
+ static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
2100
+ std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);
2101
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[2][0][0], wgsl_soft_max_f32,
2102
+ "soft_max_f32", constants);
2103
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[2][0][1], wgsl_soft_max_f32_inplace,
2104
+ "soft_max_f32_inplace", constants);
2105
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[2][1][0], wgsl_soft_max_f32_sink,
2106
+ "soft_max_f32_sink", constants);
2107
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[2][1][1],
2108
+ wgsl_soft_max_f32_sink_inplace, "soft_max_f32_sink_inplace", constants);
2109
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[0][0][0], wgsl_soft_max_f32_mask_f32,
2110
+ "soft_max_f32_mask_f32", constants);
2111
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[0][0][1],
2112
+ wgsl_soft_max_f32_mask_f32_inplace, "soft_max_f32_mask_f32_inplace", constants);
2113
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[1][0][0], wgsl_soft_max_f32_mask_f16,
2114
+ "soft_max_f32_mask_f16", constants);
2115
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[1][0][1],
2116
+ wgsl_soft_max_f32_mask_f16_inplace, "soft_max_f32_mask_f16_inplace", constants);
2117
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[0][1][0],
2118
+ wgsl_soft_max_f32_mask_f32_sink, "soft_max_f32_mask_f32_sink", constants);
2119
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[0][1][1],
2120
+ wgsl_soft_max_f32_mask_f32_sink_inplace, "soft_max_f32_mask_f32_sink_inplace",
2121
+ constants);
2122
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[1][1][0],
2123
+ wgsl_soft_max_f32_mask_f16_sink, "soft_max_f32_mask_f16_sink", constants);
2124
+ ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[1][1][1],
2125
+ wgsl_soft_max_f32_mask_f16_sink_inplace, "soft_max_f32_mask_f16_sink_inplace",
2126
+ constants);
1009
2127
  }
1010
2128
 
1011
2129
  static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -1055,24 +2173,89 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
1055
2173
  return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
1056
2174
  }
1057
2175
 
2176
+ static bool ggml_webgpu_supported_qtype(ggml_type type) {
2177
+ switch (type) {
2178
+ case GGML_TYPE_Q4_0:
2179
+ case GGML_TYPE_Q4_1:
2180
+ case GGML_TYPE_Q5_0:
2181
+ case GGML_TYPE_Q5_1:
2182
+ case GGML_TYPE_Q8_0:
2183
+ case GGML_TYPE_Q2_K:
2184
+ case GGML_TYPE_Q3_K:
2185
+ case GGML_TYPE_Q4_K:
2186
+ case GGML_TYPE_Q5_K:
2187
+ case GGML_TYPE_Q6_K:
2188
+ case GGML_TYPE_IQ2_XXS:
2189
+ case GGML_TYPE_IQ2_XS:
2190
+ case GGML_TYPE_IQ2_S:
2191
+ case GGML_TYPE_IQ3_XXS:
2192
+ case GGML_TYPE_IQ3_S:
2193
+ case GGML_TYPE_IQ1_S:
2194
+ case GGML_TYPE_IQ1_M:
2195
+ case GGML_TYPE_IQ4_NL:
2196
+ case GGML_TYPE_IQ4_XS:
2197
+ return true;
2198
+ default:
2199
+ return false;
2200
+ }
2201
+ }
2202
+
1058
2203
  static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
1059
- GGML_UNUSED(dev);
2204
+ ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
2205
+
2206
+ webgpu_context webgpu_ctx = ctx->webgpu_ctx;
1060
2207
 
2208
+ ggml_tensor * src0 = op->src[0];
2209
+ ggml_tensor * src1 = op->src[1];
2210
+ ggml_tensor * src2 = op->src[2];
2211
+
2212
+ // on smaller devices (or CI), tensors may be larger than the max storage buffer size
2213
+ if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
2214
+ (src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
2215
+ (src1 != nullptr && ggml_nbytes(src1) > webgpu_ctx->limits.maxStorageBufferBindingSize)) {
2216
+ return false;
2217
+ }
2218
+
2219
+ bool supports_op = false;
1061
2220
  switch (op->op) {
1062
2221
  case GGML_OP_NONE:
1063
2222
  case GGML_OP_VIEW:
1064
2223
  case GGML_OP_PERMUTE:
1065
- return true;
2224
+ case GGML_OP_TRANSPOSE:
2225
+ case GGML_OP_RESHAPE:
2226
+ supports_op = true;
2227
+ break;
2228
+ case GGML_OP_ADD:
2229
+ case GGML_OP_SUB:
2230
+ case GGML_OP_MUL:
2231
+ case GGML_OP_DIV:
2232
+ // TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE
2233
+ // see https://github.com/ggml-org/llama.cpp/pull/16857
2234
+ supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
2235
+ (src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
2236
+ break;
1066
2237
  case GGML_OP_CPY:
2238
+ case GGML_OP_CONT:
2239
+ supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
2240
+ (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
2241
+ break;
1067
2242
  case GGML_OP_SET_ROWS:
1068
- return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
2243
+ supports_op = (op->type == GGML_TYPE_F16 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I64);
2244
+ break;
2245
+ case GGML_OP_GET_ROWS:
2246
+ if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_I32 ||
2247
+ ggml_webgpu_supported_qtype(src0->type)) {
2248
+ supports_op = (op->type == GGML_TYPE_F32);
2249
+ }
2250
+ break;
1069
2251
  case GGML_OP_MUL_MAT:
1070
2252
  {
1071
- switch (op->src[1]->type) {
2253
+ switch (src1->type) {
1072
2254
  case GGML_TYPE_F16:
1073
- return op->src[0]->type == GGML_TYPE_F16;
2255
+ supports_op |= (src0->type == GGML_TYPE_F16);
2256
+ break;
1074
2257
  case GGML_TYPE_F32:
1075
- switch (op->src[0]->type) {
2258
+ switch (src0->type) {
1076
2259
  case GGML_TYPE_F32:
1077
2260
  case GGML_TYPE_F16:
1078
2261
  case GGML_TYPE_Q4_0:
@@ -1094,17 +2277,67 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
1094
2277
  case GGML_TYPE_IQ1_M:
1095
2278
  case GGML_TYPE_IQ4_NL:
1096
2279
  case GGML_TYPE_IQ4_XS:
1097
- return true;
2280
+ supports_op = true;
2281
+ break;
1098
2282
  default:
1099
- return false;
2283
+ break;
1100
2284
  }
1101
2285
  default:
1102
- return false;
2286
+ break;
1103
2287
  }
2288
+ break;
2289
+ }
2290
+ case GGML_OP_RMS_NORM:
2291
+ supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
2292
+ break;
2293
+ case GGML_OP_ROPE:
2294
+ supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
2295
+ break;
2296
+ case GGML_OP_GLU:
2297
+ switch (ggml_get_glu_op(op)) {
2298
+ case GGML_GLU_OP_REGLU:
2299
+ case GGML_GLU_OP_GEGLU:
2300
+ case GGML_GLU_OP_SWIGLU:
2301
+ case GGML_GLU_OP_GEGLU_ERF:
2302
+ case GGML_GLU_OP_GEGLU_QUICK:
2303
+ supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
2304
+ break;
2305
+ case GGML_GLU_OP_SWIGLU_OAI:
2306
+ supports_op = op->type == GGML_TYPE_F32;
2307
+ break;
2308
+ default:
2309
+ break;
1104
2310
  }
2311
+ break;
2312
+ case GGML_OP_SCALE:
2313
+ supports_op = op->type == GGML_TYPE_F32;
2314
+ break;
2315
+ case GGML_OP_SOFT_MAX:
2316
+ supports_op = op->type == GGML_TYPE_F32;
2317
+ break;
1105
2318
  default:
1106
- return false;
2319
+ break;
1107
2320
  }
2321
+ if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
2322
+ (src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
2323
+ (src1 != nullptr && ggml_nbytes(src1) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
2324
+ (src2 != nullptr && ggml_nbytes(src2) > webgpu_ctx->limits.maxStorageBufferBindingSize)) {
2325
+ supports_op = false;
2326
+ WEBGPU_LOG_DEBUG("ggml_webgpu op not supported due to size: ");
2327
+ }
2328
+
2329
+ if (!supports_op) {
2330
+ WEBGPU_LOG_DEBUG("ggml_webgpu op not supported: "
2331
+ << ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
2332
+ << ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
2333
+ << ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
2334
+ } else {
2335
+ WEBGPU_LOG_DEBUG("ggml_webgpu op supported: "
2336
+ << ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
2337
+ << ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
2338
+ << ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
2339
+ }
2340
+ return supports_op;
1108
2341
  }
1109
2342
 
1110
2343
  static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
@@ -1145,33 +2378,92 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
1145
2378
  GGML_ASSERT(index == 0);
1146
2379
  WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");
1147
2380
 
2381
+ WEBGPU_CPU_PROFILE_TOTAL_START(reg_get_device);
2382
+
1148
2383
  ggml_backend_webgpu_reg_context * reg_ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
1149
2384
 
1150
2385
  webgpu_context ctx = reg_ctx->webgpu_ctx;
1151
2386
 
1152
- wgpu::RequestAdapterOptions options = {};
1153
- auto callback =
1154
- [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
1155
- if (status != wgpu::RequestAdapterStatus::Success) {
1156
- GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
1157
- return;
1158
- }
1159
- *static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
1160
- };
1161
- void * userdata = &ctx->adapter;
1162
- ctx->instance.WaitAny(
1163
- ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
2387
+ // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
2388
+ const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
2389
+ wgpu::DawnTogglesDescriptor adapterTogglesDesc;
2390
+ adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
2391
+ adapterTogglesDesc.enabledToggleCount = 2;
2392
+ wgpu::RequestAdapterOptions options = {};
2393
+ options.nextInChain = &adapterTogglesDesc;
2394
+ ctx->instance.WaitAny(ctx->instance.RequestAdapter(
2395
+ &options, wgpu::CallbackMode::AllowSpontaneous,
2396
+ [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
2397
+ if (status != wgpu::RequestAdapterStatus::Success) {
2398
+ GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
2399
+ return;
2400
+ }
2401
+ ctx->adapter = std::move(adapter);
2402
+ }),
2403
+ UINT64_MAX);
1164
2404
  GGML_ASSERT(ctx->adapter != nullptr);
1165
2405
 
1166
2406
  ctx->adapter.GetLimits(&ctx->limits);
2407
+ ctx->max_wg_size_x = 288; // default value
1167
2408
 
1168
- wgpu::AdapterInfo info{};
2409
+ wgpu::AdapterInfo info{};
2410
+ wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
2411
+ if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
2412
+ info.nextInChain = &subgroup_matrix_configs;
2413
+ }
1169
2414
  ctx->adapter.GetInfo(&info);
1170
2415
 
2416
+ wgpu::SupportedFeatures features;
2417
+ ctx->adapter.GetFeatures(&features);
2418
+ // we require f16 support
2419
+ GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
2420
+
2421
+ // Only support square f16 matrices of size 8 or 16 for now
2422
+ bool valid_subgroup_matrix_config = false;
2423
+ if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
2424
+ for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
2425
+ const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
2426
+ if (config.M == config.N && config.N == config.K && (config.K == 8 || config.K == 16) &&
2427
+ config.componentType == wgpu::SubgroupMatrixComponentType::F16 &&
2428
+ config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16) {
2429
+ ctx->subgroup_matrix_config = config;
2430
+ valid_subgroup_matrix_config = true;
2431
+ break;
2432
+ }
2433
+ }
2434
+ }
2435
+
2436
+ // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
2437
+ // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
2438
+ ctx->subgroup_size = info.subgroupMaxSize;
2439
+ ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
2440
+
1171
2441
  // Initialize device
1172
2442
  std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
1173
2443
  wgpu::FeatureName::ImplicitDeviceSynchronization };
1174
- wgpu::DeviceDescriptor dev_desc;
2444
+ if (ctx->supports_subgroup_matrix) {
2445
+ required_features.push_back(wgpu::FeatureName::Subgroups);
2446
+ required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
2447
+ }
2448
+
2449
+ #ifdef GGML_WEBGPU_GPU_PROFILE
2450
+ required_features.push_back(wgpu::FeatureName::TimestampQuery);
2451
+ #endif
2452
+
2453
+ // Enable Dawn-specific toggles to increase native performance
2454
+ // TODO: Don't enable for WASM builds, they won't have an effect anyways
2455
+ // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
2456
+ // only for native performance?
2457
+ const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
2458
+ "disable_polyfills_on_integer_div_and_mod" };
2459
+ const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
2460
+ wgpu::DawnTogglesDescriptor deviceTogglesDesc;
2461
+ deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
2462
+ deviceTogglesDesc.enabledToggleCount = 4;
2463
+ deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
2464
+ deviceTogglesDesc.disabledToggleCount = 1;
2465
+
2466
+ wgpu::DeviceDescriptor dev_desc;
1175
2467
  dev_desc.requiredLimits = &ctx->limits;
1176
2468
  dev_desc.requiredFeatures = required_features.data();
1177
2469
  dev_desc.requiredFeatureCount = required_features.size();
@@ -1179,21 +2471,22 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
1179
2471
  wgpu::CallbackMode::AllowSpontaneous,
1180
2472
  [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
1181
2473
  GGML_UNUSED(device);
1182
- GGML_LOG_ERROR(
1183
- "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
2474
+ GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
2475
+ std::string(message).c_str());
1184
2476
  });
1185
2477
  dev_desc.SetUncapturedErrorCallback(
1186
2478
  [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
1187
2479
  GGML_UNUSED(device);
1188
- GGML_LOG_ERROR(
1189
- "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
2480
+ GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
2481
+ std::string(message).c_str());
1190
2482
  });
2483
+ dev_desc.nextInChain = &deviceTogglesDesc;
1191
2484
  ctx->instance.WaitAny(ctx->adapter.RequestDevice(
1192
- &dev_desc,
1193
- wgpu::CallbackMode::AllowSpontaneous,
2485
+ &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
1194
2486
  [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
1195
2487
  if (status != wgpu::RequestDeviceStatus::Success) {
1196
- GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
2488
+ GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n",
2489
+ std::string(message).c_str());
1197
2490
  return;
1198
2491
  }
1199
2492
  ctx->device = std::move(device);
@@ -1205,34 +2498,43 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
1205
2498
  ctx->queue = ctx->device.GetQueue();
1206
2499
 
1207
2500
  // Create buffer pool for shader parameters
1208
- ctx->param_buf_pool.init(ctx->device,
1209
- WEBGPU_NUM_PARAM_BUFS,
1210
- WEBGPU_PARAMS_BUF_SIZE_BYTES,
2501
+ ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES,
1211
2502
  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
1212
2503
  wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
1213
- ctx->set_rows_error_buf_pool.init(ctx->device,
1214
- WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
1215
- WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
2504
+
2505
+ #ifdef GGML_WEBGPU_GPU_PROFILE
2506
+ // Initialize buffer pool for timestamp queries (profiling)
2507
+ ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS,
2508
+ WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
2509
+ wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
2510
+ wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
2511
+ #endif
2512
+
2513
+ ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
1216
2514
  wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
1217
2515
  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
1218
2516
 
1219
2517
  ggml_webgpu_init_memset_pipeline(ctx);
1220
2518
  ggml_webgpu_init_mul_mat_pipeline(ctx);
1221
2519
  ggml_webgpu_init_set_rows_pipeline(ctx);
2520
+ ggml_webgpu_init_get_rows_pipeline(ctx);
1222
2521
  ggml_webgpu_init_cpy_pipeline(ctx);
2522
+ ggml_webgpu_init_add_pipeline(ctx);
2523
+ ggml_webgpu_init_sub_pipeline(ctx);
2524
+ ggml_webgpu_init_mul_pipeline(ctx);
2525
+ ggml_webgpu_init_div_pipeline(ctx);
2526
+ ggml_webgpu_init_rms_norm_pipeline(ctx);
2527
+ ggml_webgpu_init_rope_pipeline(ctx);
2528
+ ggml_webgpu_init_glu_pipeline(ctx);
2529
+ ggml_webgpu_init_scale_pipeline(ctx);
2530
+ ggml_webgpu_init_soft_max_pipeline(ctx);
1223
2531
 
1224
2532
  #ifdef GGML_WEBGPU_DEBUG
1225
2533
  // Initialize debug buffers
1226
- ggml_webgpu_create_buffer(ctx->device,
1227
- ctx->debug_host_buf,
1228
- WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
1229
- wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
1230
- "debug_host_buf");
1231
- ggml_webgpu_create_buffer(ctx->device,
1232
- ctx->debug_dev_buf,
1233
- WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
1234
- wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
1235
- "debug_dev_buf");
2534
+ ggml_webgpu_create_buffer(ctx->device, ctx->debug_host_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
2535
+ wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "debug_host_buf");
2536
+ ggml_webgpu_create_buffer(ctx->device, ctx->debug_dev_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
2537
+ wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "debug_dev_buf");
1236
2538
  #endif
1237
2539
 
1238
2540
  static ggml_backend_webgpu_device_context device_ctx;
@@ -1243,12 +2545,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
1243
2545
  GGML_LOG_INFO(
1244
2546
  "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
1245
2547
  "device_desc: %s\n",
1246
- info.vendorID,
1247
- std::string(info.vendor).c_str(),
1248
- std::string(info.architecture).c_str(),
1249
- info.deviceID,
1250
- std::string(info.device).c_str(),
1251
- std::string(info.description).c_str());
2548
+ info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
2549
+ std::string(info.device).c_str(), std::string(info.description).c_str());
1252
2550
 
1253
2551
  // See GGML Backend Device Interface section
1254
2552
  static ggml_backend_device device = {
@@ -1256,6 +2554,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
1256
2554
  /* .reg = */ reg,
1257
2555
  /* .context = */ &device_ctx,
1258
2556
  };
2557
+
2558
+ WEBGPU_CPU_PROFILE_TOTAL_END(reg_get_device, ctx);
1259
2559
  return &device;
1260
2560
  }
1261
2561
 
@@ -1278,11 +2578,18 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
1278
2578
  ctx.name = GGML_WEBGPU_NAME;
1279
2579
  ctx.device_count = 1;
1280
2580
 
2581
+ const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
2582
+
2583
+ wgpu::DawnTogglesDescriptor instanceTogglesDesc;
2584
+ instanceTogglesDesc.enabledToggles = instanceEnabledToggles;
2585
+ instanceTogglesDesc.enabledToggleCount = 1;
1281
2586
  wgpu::InstanceDescriptor instance_descriptor{};
1282
2587
  std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
1283
2588
  instance_descriptor.requiredFeatures = instance_features.data();
1284
2589
  instance_descriptor.requiredFeatureCount = instance_features.size();
1285
- webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
2590
+ instance_descriptor.nextInChain = &instanceTogglesDesc;
2591
+
2592
+ webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
1286
2593
  GGML_ASSERT(webgpu_ctx->instance != nullptr);
1287
2594
 
1288
2595
  static ggml_backend_reg reg = {