@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253) hide show
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
78
78
  #define VK_VENDOR_ID_INTEL 0x8086
79
79
  #define VK_VENDOR_ID_NVIDIA 0x10de
80
80
 
81
- #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
81
+ #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
82
82
 
83
83
  #define GGML_VK_MAX_NODES 8192
84
84
 
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
102
102
 
103
103
  struct ggml_backend_vk_context;
104
104
 
105
- struct vk_queue {
106
- uint32_t queue_family_index;
107
- vk::Queue queue;
108
- vk::CommandPool pool;
109
- uint32_t cmd_buffer_idx;
110
- std::vector<vk::CommandBuffer> cmd_buffers;
111
-
112
- vk::PipelineStageFlags stage_flags;
113
-
114
- bool transfer_only;
115
- };
105
+ #define MAX_PARAMETER_COUNT 8
116
106
 
117
107
  struct vk_pipeline_struct {
118
108
  std::string name;
119
109
  vk::ShaderModule shader_module;
120
- vk::DescriptorSetLayout dsl;
121
- std::vector<vk::DescriptorPool> descriptor_pools;
122
- std::vector<vk::DescriptorSet> descriptor_sets;
123
- uint32_t descriptor_set_idx;
124
110
  vk::PipelineLayout layout;
125
111
  vk::Pipeline pipeline;
126
112
  uint32_t push_constant_size;
@@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context {
167
153
  vk_device device;
168
154
  };
169
155
 
156
+ struct vk_queue;
157
+
158
+ // Stores command pool/buffers. There's an instance of this
159
+ // for each (context,queue) pair and for each (device,queue) pair.
160
+ struct vk_command_pool {
161
+ void init(vk_device& device, vk_queue *q_);
162
+ void destroy(vk::Device& device);
163
+
164
+ vk::CommandPool pool;
165
+ uint32_t cmd_buffer_idx;
166
+ std::vector<vk::CommandBuffer> cmd_buffers;
167
+
168
+ vk_queue *q;
169
+ };
170
+
171
+ // Prevent simultaneous submissions to the same queue.
172
+ // This could be per vk_queue if we stopped having two vk_queue structures
173
+ // sharing the same vk::Queue.
174
+ static std::mutex queue_mutex;
175
+
176
+ struct vk_queue {
177
+ uint32_t queue_family_index;
178
+ vk::Queue queue;
179
+
180
+ vk_command_pool cmd_pool;
181
+
182
+ vk::PipelineStageFlags stage_flags;
183
+
184
+ bool transfer_only;
185
+
186
+ // copy everything except the cmd_pool
187
+ void copyFrom(vk_queue &other) {
188
+ queue_family_index = other.queue_family_index;
189
+ queue = other.queue;
190
+ stage_flags = other.stage_flags;
191
+ transfer_only = other.transfer_only;
192
+ }
193
+ };
194
+
170
195
  static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
171
196
  static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
172
197
  static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
@@ -196,6 +221,7 @@ enum vk_device_architecture {
196
221
  AMD_RDNA1,
197
222
  AMD_RDNA2,
198
223
  AMD_RDNA3,
224
+ INTEL_XE2,
199
225
  };
200
226
 
201
227
  static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
246
272
  }
247
273
  return vk_device_architecture::AMD_RDNA2;
248
274
  }
275
+ } else if (props.vendorID == VK_VENDOR_ID_INTEL) {
276
+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
277
+
278
+ bool subgroup_size_control = false;
279
+
280
+ for (const auto& properties : ext_props) {
281
+ if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
282
+ subgroup_size_control = true;
283
+ }
284
+ }
285
+
286
+ if (!subgroup_size_control) {
287
+ return vk_device_architecture::OTHER;
288
+ }
289
+
290
+ vk::PhysicalDeviceProperties2 props2;
291
+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
292
+
293
+ props2.pNext = &subgroup_size_control_props;
294
+ device.getProperties2(&props2);
295
+
296
+ if (subgroup_size_control_props.minSubgroupSize == 16) {
297
+ // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
298
+ // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
299
+ // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
300
+ // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
301
+ return vk_device_architecture::INTEL_XE2;
302
+ }
249
303
  }
250
304
  return vk_device_architecture::OTHER;
251
305
  }
@@ -312,6 +366,8 @@ struct vk_device_struct {
312
366
  // set to true to indicate that some shaders need to be compiled after the dryrun
313
367
  bool need_compiles {};
314
368
 
369
+ vk::DescriptorSetLayout dsl;
370
+
315
371
  vk_matmul_pipeline pipeline_matmul_f32 {};
316
372
  vk_matmul_pipeline pipeline_matmul_f32_f16 {};
317
373
  vk_matmul_pipeline pipeline_matmul_bf16 {};
@@ -396,6 +452,7 @@ struct vk_device_struct {
396
452
  vk_pipeline pipeline_count_equal_i32;
397
453
  vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
398
454
  vk_pipeline pipeline_timestep_embedding_f32;
455
+ vk_pipeline pipeline_conv_transpose_1d_f32;
399
456
  vk_pipeline pipeline_pool2d_f32;
400
457
  vk_pipeline pipeline_rwkv_wkv6_f32;
401
458
  vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -428,7 +485,6 @@ struct vk_device_struct {
428
485
  vk_pipeline pipeline_flash_attn_split_k_reduce;
429
486
 
430
487
  std::unordered_map<std::string, vk_pipeline_ref> pipelines;
431
- std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
432
488
 
433
489
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
434
490
 
@@ -444,7 +500,7 @@ struct vk_device_struct {
444
500
  // for GGML_VK_PERF_LOGGER
445
501
  std::unique_ptr<vk_perf_logger> perf_logger;
446
502
  vk::QueryPool query_pool;
447
- uint32_t num_queries;
503
+ int32_t num_queries;
448
504
 
449
505
  ~vk_device_struct() {
450
506
  VK_LOG_DEBUG("destroy device " << name);
@@ -453,10 +509,8 @@ struct vk_device_struct {
453
509
 
454
510
  ggml_vk_destroy_buffer(sync_staging);
455
511
 
456
- device.destroyCommandPool(compute_queue.pool);
457
- if (!single_queue) {
458
- device.destroyCommandPool(transfer_queue.pool);
459
- }
512
+ compute_queue.cmd_pool.destroy(device);
513
+ transfer_queue.cmd_pool.destroy(device);
460
514
 
461
515
  for (auto& pipeline : pipelines) {
462
516
  if (pipeline.second.expired()) {
@@ -468,10 +522,26 @@ struct vk_device_struct {
468
522
  }
469
523
  pipelines.clear();
470
524
 
525
+ device.destroyDescriptorSetLayout(dsl);
526
+
471
527
  device.destroy();
472
528
  }
473
529
  };
474
530
 
531
+ void vk_command_pool::init(vk_device& device, vk_queue *q_) {
532
+ cmd_buffer_idx = 0;
533
+ q = q_;
534
+
535
+ vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
536
+ pool = device->device.createCommandPool(command_pool_create_info);
537
+ }
538
+
539
+ void vk_command_pool::destroy(vk::Device& device) {
540
+ device.destroyCommandPool(pool);
541
+ pool = nullptr;
542
+ cmd_buffers.clear();
543
+ }
544
+
475
545
  struct vk_buffer_struct {
476
546
  vk::Buffer buffer = VK_NULL_HANDLE;
477
547
  vk::DeviceMemory device_memory = VK_NULL_HANDLE;
@@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants {
706
776
  uint32_t max_period;
707
777
  };
708
778
 
779
+ struct vk_op_conv_transpose_1d_push_constants {
780
+ uint32_t Cout;
781
+ uint32_t Cin;
782
+ uint32_t K;
783
+ uint32_t L;
784
+ uint32_t KL;
785
+
786
+ uint32_t nb01;
787
+ uint32_t nb02;
788
+ uint32_t nb11;
789
+ uint32_t nb1;
790
+
791
+ int32_t s0;
792
+ };
793
+
709
794
  struct vk_op_pool2d_push_constants {
710
795
  uint32_t IW; uint32_t IH;
711
796
  uint32_t OW; uint32_t OH;
@@ -774,7 +859,7 @@ struct vk_context_struct {
774
859
  std::vector<vk_staging_memcpy> in_memcpys;
775
860
  std::vector<vk_staging_memcpy> out_memcpys;
776
861
 
777
- vk_queue * q;
862
+ vk_command_pool * p {};
778
863
  };
779
864
  typedef std::shared_ptr<vk_context_struct> vk_context;
780
865
  typedef std::weak_ptr<vk_context_struct> vk_context_ref;
@@ -885,6 +970,14 @@ struct ggml_backend_vk_context {
885
970
  vk_context_ref transfer_ctx;
886
971
 
887
972
  std::vector<vk_context_ref> tensor_ctxs;
973
+
974
+ std::vector<vk::DescriptorPool> descriptor_pools;
975
+ std::vector<vk::DescriptorSet> descriptor_sets;
976
+ uint32_t descriptor_set_idx {};
977
+ uint32_t pipeline_descriptor_set_requirements {};
978
+
979
+ vk_command_pool compute_cmd_pool;
980
+ vk_command_pool transfer_cmd_pool;
888
981
  };
889
982
 
890
983
  static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -948,6 +1041,14 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
948
1041
  struct vk_instance_t {
949
1042
  vk::Instance instance;
950
1043
 
1044
+ bool debug_utils_support = false; // VK_EXT_debug_utils enabled
1045
+ PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {};
1046
+ PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {};
1047
+ PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {};
1048
+ PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {};
1049
+ PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {};
1050
+ PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {};
1051
+
951
1052
  std::vector<size_t> device_indices;
952
1053
  vk_device devices[GGML_VK_MAX_DEVICES];
953
1054
  };
@@ -1015,39 +1116,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
1015
1116
  ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
1016
1117
  disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
1017
1118
  GGML_ASSERT(parameter_count > 0);
1119
+ GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
1018
1120
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
1019
1121
 
1020
1122
  vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
1021
1123
  pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
1022
1124
 
1023
- std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
1024
- std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
1025
- for (uint32_t i = 0; i < parameter_count; i++) {
1026
- dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
1027
- dsl_binding_flags.push_back({});
1028
- }
1029
-
1030
- vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
1031
-
1032
1125
  vk::PushConstantRange pcr(
1033
1126
  vk::ShaderStageFlagBits::eCompute,
1034
1127
  0,
1035
1128
  pipeline->push_constant_size
1036
1129
  );
1037
1130
 
1038
- vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
1039
- {},
1040
- dsl_binding);
1041
- descriptor_set_layout_create_info.setPNext(&dslbfci);
1042
- pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
1043
-
1044
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1045
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1046
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1047
-
1048
- pipeline->descriptor_set_idx = 0;
1049
-
1050
- vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
1131
+ vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
1051
1132
  pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
1052
1133
 
1053
1134
  std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
@@ -1107,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
1107
1188
  }
1108
1189
  pipeline->compiled = true;
1109
1190
 
1191
+ if (vk_instance.debug_utils_support) {
1192
+ vk::DebugUtilsObjectNameInfoEXT duoni;
1193
+ duoni.objectType = vk::ObjectType::ePipeline;
1194
+ duoni.pObjectName = pipeline->name.c_str();
1195
+ duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
1196
+ vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
1197
+ }
1198
+
1110
1199
  {
1111
1200
  std::lock_guard<std::mutex> guard(device->mutex);
1112
1201
  device->pipelines.insert({ pipeline->name, pipeline });
@@ -1122,15 +1211,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
1122
1211
 
1123
1212
  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
1124
1213
  VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
1125
- for (auto& pool : pipeline->descriptor_pools) {
1126
- device.destroyDescriptorPool(pool);
1127
- }
1128
- pipeline->descriptor_pools.clear();
1129
- pipeline->descriptor_sets.clear();
1130
- pipeline->descriptor_set_idx = 0;
1131
-
1132
- device.destroyDescriptorSetLayout(pipeline->dsl);
1133
-
1134
1214
  device.destroyPipelineLayout(pipeline->layout);
1135
1215
 
1136
1216
  device.destroyShaderModule(pipeline->shader_module);
@@ -1138,97 +1218,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
1138
1218
  device.destroyPipeline(pipeline->pipeline);
1139
1219
  }
1140
1220
 
1141
- static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
1221
+ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
1142
1222
  VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
1143
- device->pipeline_descriptor_set_requirements[pipeline->name] += n;
1223
+ ctx->pipeline_descriptor_set_requirements += n;
1144
1224
  if (!pipeline->compiled) {
1145
1225
  pipeline->needed = true;
1146
- device->need_compiles = true;
1226
+ ctx->device->need_compiles = true;
1147
1227
  }
1148
1228
  }
1149
1229
 
1150
- static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
1151
- std::lock_guard<std::mutex> guard(device->mutex);
1152
-
1153
- for (auto& pair : device->pipeline_descriptor_set_requirements) {
1154
- vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
1155
- const uint64_t n = pair.second;
1156
-
1157
- VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
1230
+ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
1158
1231
 
1159
- if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
1160
- // Enough descriptors are available
1161
- continue;
1162
- }
1232
+ if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
1233
+ // Enough descriptors are available
1234
+ return;
1235
+ }
1163
1236
 
1164
- uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
1165
- uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1166
- uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1237
+ vk_device& device = ctx->device;
1167
1238
 
1168
- while (to_alloc > 0) {
1169
- const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
1170
- to_alloc -= alloc_count;
1171
- pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1239
+ uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
1240
+ uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1241
+ uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1172
1242
 
1173
- if (pool_idx >= pipeline->descriptor_pools.size()) {
1174
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1175
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1176
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1177
- }
1243
+ while (to_alloc > 0) {
1244
+ const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
1245
+ to_alloc -= alloc_count;
1246
+ pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1178
1247
 
1179
- std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
1180
- for (uint32_t i = 0; i < alloc_count; i++) {
1181
- layouts[i] = pipeline->dsl;
1182
- }
1183
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
1184
- std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
1185
- pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
1248
+ if (pool_idx >= ctx->descriptor_pools.size()) {
1249
+ vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1250
+ vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1251
+ ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1252
+ }
1186
1253
 
1187
- pool_idx++;
1254
+ std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
1255
+ for (uint32_t i = 0; i < alloc_count; i++) {
1256
+ layouts[i] = device->dsl;
1188
1257
  }
1189
- }
1190
- }
1258
+ vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
1259
+ std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
1260
+ ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
1191
1261
 
1192
- static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
1193
- VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
1194
- pipeline->descriptor_set_idx = 0;
1262
+ pool_idx++;
1263
+ }
1195
1264
  }
1196
1265
 
1197
- static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
1266
+ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
1198
1267
  VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
1199
- std::lock_guard<std::mutex> guard(device->mutex);
1200
1268
 
1201
- if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
1269
+ if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
1202
1270
  // Reuse command buffer
1203
- return q.cmd_buffers[q.cmd_buffer_idx++];
1271
+ return p.cmd_buffers[p.cmd_buffer_idx++];
1204
1272
  }
1205
1273
 
1206
1274
  vk::CommandBufferAllocateInfo command_buffer_alloc_info(
1207
- q.pool,
1275
+ p.pool,
1208
1276
  vk::CommandBufferLevel::ePrimary,
1209
1277
  1);
1210
1278
  const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
1211
1279
  auto buf = cmd_buffers.front();
1212
1280
 
1213
- q.cmd_buffers.push_back(buf);
1214
- q.cmd_buffer_idx++;
1281
+ p.cmd_buffers.push_back(buf);
1282
+ p.cmd_buffer_idx++;
1215
1283
 
1216
1284
  return buf;
1217
1285
  }
1218
1286
 
1219
- static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
1220
- VK_LOG_DEBUG("ggml_vk_create_submission()");
1221
- vk_submission s;
1222
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
1223
- s.wait_semaphores = std::move(wait_semaphores);
1224
- s.signal_semaphores = std::move(signal_semaphores);
1225
- return s;
1226
- }
1227
-
1228
1287
  static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1229
1288
  if (ctx->seqs.empty()) {
1230
1289
  if (fence) {
1231
- ctx->q->queue.submit({}, fence);
1290
+ std::lock_guard<std::mutex> guard(queue_mutex);
1291
+ ctx->p->q->queue.submit({}, fence);
1232
1292
  }
1233
1293
  return;
1234
1294
  }
@@ -1267,7 +1327,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1267
1327
  tl_signal_vals.push_back({});
1268
1328
  tl_signal_semaphores.push_back({});
1269
1329
  for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
1270
- stage_flags[idx].push_back(ctx->q->stage_flags);
1330
+ stage_flags[idx].push_back(ctx->p->q->stage_flags);
1271
1331
  tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
1272
1332
  tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
1273
1333
  }
@@ -1297,7 +1357,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1297
1357
  }
1298
1358
  }
1299
1359
 
1300
- ctx->q->queue.submit(submit_infos, fence);
1360
+ std::lock_guard<std::mutex> guard(queue_mutex);
1361
+ ctx->p->q->queue.submit(submit_infos, fence);
1301
1362
 
1302
1363
  ctx->seqs.clear();
1303
1364
  }
@@ -1355,28 +1416,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
1355
1416
  q.queue_family_index = queue_family_index;
1356
1417
  q.transfer_only = transfer_only;
1357
1418
 
1358
- vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
1359
- q.pool = device->device.createCommandPool(command_pool_create_info_compute);
1360
-
1361
- q.cmd_buffer_idx = 0;
1419
+ q.cmd_pool.init(device, &q);
1362
1420
 
1363
1421
  q.queue = device->device.getQueue(queue_family_index, queue_index);
1364
1422
 
1365
1423
  q.stage_flags = stage_flags;
1366
1424
  }
1367
1425
 
1368
- static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
1426
+ static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
1369
1427
  vk_context result = std::make_shared<vk_context_struct>();
1370
1428
  VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
1371
1429
  ctx->gc.contexts.emplace_back(result);
1372
- result->q = &q;
1430
+ result->p = &p;
1373
1431
  return result;
1374
1432
  }
1375
1433
 
1376
- static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
1434
+ static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
1377
1435
  vk_context result = std::make_shared<vk_context_struct>();
1378
1436
  VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
1379
- result->q = &q;
1437
+ result->p = &p;
1380
1438
  return result;
1381
1439
  }
1382
1440
 
@@ -1409,15 +1467,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
1409
1467
  return ctx->gc.events[ctx->event_idx++];
1410
1468
  }
1411
1469
 
1412
- static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
1413
- VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
1414
- std::lock_guard<std::mutex> guard(device->mutex);
1470
+ static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
1471
+ VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
1415
1472
 
1416
1473
  // Requires command buffers to be done
1417
- device->device.resetCommandPool(q.pool);
1418
- q.cmd_buffer_idx = 0;
1474
+ device->device.resetCommandPool(p.pool);
1475
+ p.cmd_buffer_idx = 0;
1419
1476
  }
1420
1477
 
1478
+ static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
1479
+ VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
1480
+
1481
+ // Arbitrary frequency to cleanup/reuse command buffers
1482
+ static constexpr uint32_t cleanup_frequency = 10;
1483
+
1484
+ if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
1485
+ ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
1486
+ }
1487
+ if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
1488
+ ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
1489
+ }
1490
+ }
1491
+
1492
+
1421
1493
  static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
1422
1494
  for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
1423
1495
  vk::MemoryType memory_type = mem_props->memoryTypes[i];
@@ -1436,8 +1508,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
1436
1508
  throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
1437
1509
  }
1438
1510
 
1439
- std::lock_guard<std::mutex> guard(device->mutex);
1440
-
1441
1511
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
1442
1512
 
1443
1513
  if (size == 0) {
@@ -1566,11 +1636,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
1566
1636
  static void ggml_vk_sync_buffers(vk_context& ctx) {
1567
1637
  VK_LOG_DEBUG("ggml_vk_sync_buffers()");
1568
1638
 
1569
- const bool transfer_queue = ctx->q->transfer_only;
1639
+ const bool transfer_queue = ctx->p->q->transfer_only;
1570
1640
 
1571
1641
  ctx->s->buffer.pipelineBarrier(
1572
- ctx->q->stage_flags,
1573
- ctx->q->stage_flags,
1642
+ ctx->p->q->stage_flags,
1643
+ ctx->p->q->stage_flags,
1574
1644
  {},
1575
1645
  { {
1576
1646
  { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
@@ -1589,8 +1659,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
1589
1659
 
1590
1660
  ctx->s->buffer.waitEvents(
1591
1661
  events,
1592
- ctx->q->stage_flags,
1593
- ctx->q->stage_flags,
1662
+ ctx->p->q->stage_flags,
1663
+ ctx->p->q->stage_flags,
1594
1664
  {},
1595
1665
  {},
1596
1666
  {}
@@ -2726,6 +2796,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
2726
2796
 
2727
2797
  ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
2728
2798
 
2799
+ ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
2800
+
2729
2801
  ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
2730
2802
 
2731
2803
  ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -3322,6 +3394,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
3322
3394
  }
3323
3395
  }
3324
3396
 
3397
+
3398
+ std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
3399
+ std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
3400
+ for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
3401
+ dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
3402
+ dsl_binding_flags.push_back({});
3403
+ }
3404
+
3405
+ vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
3406
+
3407
+ vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
3408
+ {},
3409
+ dsl_binding);
3410
+ descriptor_set_layout_create_info.setPNext(&dslbfci);
3411
+ device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
3412
+
3325
3413
  ggml_vk_load_shaders(device);
3326
3414
 
3327
3415
  if (!device->single_queue) {
@@ -3329,7 +3417,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
3329
3417
  ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
3330
3418
  } else {
3331
3419
  // TODO: Use pointer or reference to avoid copy
3332
- device->transfer_queue = device->compute_queue;
3420
+ device->transfer_queue.copyFrom(device->compute_queue);
3421
+ device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
3333
3422
  }
3334
3423
 
3335
3424
  device->buffer_type = {
@@ -3488,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
3488
3577
  static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
3489
3578
  static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
3490
3579
 
3580
+ static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
3581
+
3491
3582
  static void ggml_vk_instance_init() {
3492
3583
  if (vk_instance_initialized) {
3493
3584
  return;
@@ -3508,7 +3599,7 @@ static void ggml_vk_instance_init() {
3508
3599
  #ifdef __APPLE__
3509
3600
  const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
3510
3601
  #endif
3511
-
3602
+ const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr;
3512
3603
  std::vector<const char*> layers;
3513
3604
 
3514
3605
  if (validation_ext) {
@@ -3523,6 +3614,9 @@ static void ggml_vk_instance_init() {
3523
3614
  extensions.push_back("VK_KHR_portability_enumeration");
3524
3615
  }
3525
3616
  #endif
3617
+ if (debug_utils_ext) {
3618
+ extensions.push_back("VK_EXT_debug_utils");
3619
+ }
3526
3620
  vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
3527
3621
  #ifdef __APPLE__
3528
3622
  if (portability_enumeration_ext) {
@@ -3546,13 +3640,25 @@ static void ggml_vk_instance_init() {
3546
3640
  vk_instance.instance = vk::createInstance(instance_create_info);
3547
3641
  vk_instance_initialized = true;
3548
3642
 
3549
- vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
3643
+ if (debug_utils_ext) {
3644
+ vk_instance.debug_utils_support = true;
3645
+ vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT");
3646
+ vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT");
3647
+ vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT");
3648
+ vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT");
3649
+ vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT");
3650
+ vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT");
3651
+
3652
+ }
3550
3653
 
3551
3654
  size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
3655
+ vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
3552
3656
 
3553
3657
  // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
3554
3658
  char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
3555
3659
  if (devices_env != nullptr) {
3660
+ size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
3661
+
3556
3662
  std::string devices(devices_env);
3557
3663
  std::replace(devices.begin(), devices.end(), ',', ' ');
3558
3664
 
@@ -3568,9 +3674,9 @@ static void ggml_vk_instance_init() {
3568
3674
  } else {
3569
3675
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
3570
3676
 
3571
- // Make sure at least one device exists
3677
+ // If no vulkan devices are found, return early
3572
3678
  if (devices.empty()) {
3573
- std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
3679
+ GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
3574
3680
  return;
3575
3681
  }
3576
3682
 
@@ -3653,9 +3759,20 @@ static void ggml_vk_instance_init() {
3653
3759
  }
3654
3760
  }
3655
3761
 
3656
- // If no dedicated GPUs found, fall back to GPU 0
3762
+ // If no dedicated GPUs found, fall back to the first non-CPU device.
3763
+ // If only CPU devices are available, return without devices.
3657
3764
  if (vk_instance.device_indices.empty()) {
3658
- vk_instance.device_indices.push_back(0);
3765
+ for (size_t i = 0; i < devices.size(); i++) {
3766
+ if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) {
3767
+ vk_instance.device_indices.push_back(i);
3768
+ break;
3769
+ }
3770
+ }
3771
+ }
3772
+
3773
+ if (vk_instance.device_indices.empty()) {
3774
+ GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
3775
+ return;
3659
3776
  }
3660
3777
  }
3661
3778
  GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
@@ -3684,6 +3801,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
3684
3801
  ctx->fence = ctx->device->device.createFence({});
3685
3802
  ctx->almost_ready_fence = ctx->device->device.createFence({});
3686
3803
 
3804
+ ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
3805
+ ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
3806
+
3687
3807
  #ifdef GGML_VULKAN_CHECK_RESULTS
3688
3808
  const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
3689
3809
  vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -4049,9 +4169,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
4049
4169
  }
4050
4170
  }
4051
4171
 
4052
- static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
4172
+ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
4053
4173
  vk_submission s;
4054
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
4174
+ s.buffer = ggml_vk_create_cmd_buffer(device, p);
4055
4175
  if (one_time) {
4056
4176
  s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
4057
4177
  } else {
@@ -4061,7 +4181,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
4061
4181
  return s;
4062
4182
  }
4063
4183
 
4064
- static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
4184
+ template <typename T> size_t push_constant_size(const T &t) {
4185
+ static_assert(std::is_class<T>::value, "T must be a struct/class");
4186
+ GGML_UNUSED(t);
4187
+ return sizeof(T);
4188
+ }
4189
+ template <typename T> size_t push_constant_size(const std::vector<T> &t) {
4190
+ GGML_UNUSED(t);
4191
+ return sizeof(T) * t.size();
4192
+ }
4193
+ template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
4194
+ GGML_UNUSED(t);
4195
+ return sizeof(T) * N;
4196
+ }
4197
+
4198
+ template <typename T> const T *push_constant_data(const T &t) {
4199
+ static_assert(std::is_class<T>::value, "T must be a struct/class");
4200
+ return &t;
4201
+ }
4202
+ template <typename T> const T *push_constant_data(const std::vector<T> &t) {
4203
+ return t.data();
4204
+ }
4205
+ template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
4206
+ return t.data();
4207
+ }
4208
+
4209
+ template <typename T>
4210
+ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
4065
4211
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
4066
4212
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
4067
4213
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
@@ -4070,14 +4216,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
4070
4216
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
4071
4217
  }
4072
4218
  std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
4073
- GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
4074
- GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
4219
+ GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
4220
+ GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
4075
4221
 
4076
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
4222
+ vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
4077
4223
  vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
4078
4224
  ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
4079
4225
 
4080
- subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
4226
+ subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
4081
4227
  subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
4082
4228
  subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
4083
4229
  pipeline->layout,
@@ -4110,7 +4256,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
4110
4256
  ggml_vk_ctx_end(subctx);
4111
4257
  }
4112
4258
 
4113
- subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
4259
+ subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
4114
4260
  subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
4115
4261
  }
4116
4262
 
@@ -4311,7 +4457,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
4311
4457
  memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
4312
4458
  }
4313
4459
  } else {
4314
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
4460
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
4461
+
4462
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
4315
4463
  ggml_vk_ctx_begin(dst->device, subctx);
4316
4464
  ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
4317
4465
  ggml_vk_ctx_end(subctx);
@@ -4323,6 +4471,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
4323
4471
  ggml_vk_submit(subctx, dst->device->fence);
4324
4472
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
4325
4473
  dst->device->device.resetFences({ dst->device->fence });
4474
+ ggml_vk_queue_command_pools_cleanup(dst->device);
4326
4475
  }
4327
4476
  }
4328
4477
 
@@ -4399,7 +4548,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
4399
4548
 
4400
4549
  memcpy(dst, (uint8_t *) src->ptr + offset, size);
4401
4550
  } else {
4402
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
4551
+ std::lock_guard<std::mutex> guard(src->device->mutex);
4552
+
4553
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
4403
4554
  ggml_vk_ctx_begin(src->device, subctx);
4404
4555
  ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
4405
4556
  ggml_vk_ctx_end(subctx);
@@ -4407,6 +4558,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
4407
4558
  ggml_vk_submit(subctx, src->device->fence);
4408
4559
  VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
4409
4560
  src->device->device.resetFences({ src->device->fence });
4561
+ ggml_vk_queue_command_pools_cleanup(src->device);
4410
4562
 
4411
4563
  for (auto& cpy : subctx->out_memcpys) {
4412
4564
  memcpy(cpy.dst, cpy.src, cpy.n);
@@ -4426,15 +4578,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
4426
4578
 
4427
4579
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
4428
4580
  if (src->device == dst->device) {
4581
+ std::lock_guard<std::mutex> guard(src->device->mutex);
4429
4582
  VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
4430
4583
  // Copy within the device
4431
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
4584
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
4432
4585
  ggml_vk_ctx_begin(src->device, subctx);
4433
4586
  ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
4434
4587
  ggml_vk_ctx_end(subctx);
4435
4588
  ggml_vk_submit(subctx, src->device->fence);
4436
4589
  VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
4437
4590
  src->device->device.resetFences({ src->device->fence });
4591
+ ggml_vk_queue_command_pools_cleanup(src->device);
4438
4592
  } else {
4439
4593
  VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
4440
4594
  // Copy device to device
@@ -4459,7 +4613,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
4459
4613
  static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
4460
4614
  VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
4461
4615
 
4462
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
4616
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
4617
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
4463
4618
  ggml_vk_ctx_begin(dst->device, subctx);
4464
4619
  subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
4465
4620
  ggml_vk_ctx_end(subctx);
@@ -4467,6 +4622,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
4467
4622
  ggml_vk_submit(subctx, dst->device->fence);
4468
4623
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
4469
4624
  dst->device->device.resetFences({ dst->device->fence });
4625
+ ggml_vk_queue_command_pools_cleanup(dst->device);
4470
4626
  }
4471
4627
 
4472
4628
  static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
@@ -4540,7 +4696,7 @@ static void ggml_vk_matmul(
4540
4696
  ggml_vk_sync_buffers(subctx);
4541
4697
  if (split_k == 1) {
4542
4698
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
4543
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
4699
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
4544
4700
  return;
4545
4701
  }
4546
4702
 
@@ -4548,10 +4704,10 @@ static void ggml_vk_matmul(
4548
4704
 
4549
4705
  const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
4550
4706
  // Make sure enough workgroups get assigned for split k to work
4551
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
4707
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
4552
4708
  ggml_vk_sync_buffers(subctx);
4553
4709
  const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
4554
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
4710
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
4555
4711
  }
4556
4712
 
4557
4713
  static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
@@ -4599,7 +4755,7 @@ static void ggml_vk_matmul_id(
4599
4755
  ggml_vk_sync_buffers(subctx);
4600
4756
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
4601
4757
  nei0, nei1, nbi1, ne11, padded_n };
4602
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
4758
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
4603
4759
  }
4604
4760
 
4605
4761
  static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
@@ -4720,7 +4876,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
4720
4876
  };
4721
4877
  init_pushconst_fastdiv(pc);
4722
4878
  ggml_vk_sync_buffers(subctx);
4723
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
4879
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
4724
4880
  }
4725
4881
 
4726
4882
  static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -4739,7 +4895,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
4739
4895
  vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
4740
4896
 
4741
4897
  ggml_vk_sync_buffers(subctx);
4742
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 });
4898
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
4743
4899
  }
4744
4900
 
4745
4901
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -4880,18 +5036,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
4880
5036
  }
4881
5037
 
4882
5038
  // Request descriptor sets
4883
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
5039
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
4884
5040
  if (qx_needs_dequant) {
4885
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5041
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
4886
5042
  }
4887
5043
  if (qy_needs_dequant) {
4888
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5044
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
4889
5045
  }
4890
5046
  if (quantize_y) {
4891
- ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
5047
+ ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
4892
5048
  }
4893
5049
  if (split_k > 1) {
4894
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
5050
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
4895
5051
  }
4896
5052
  return;
4897
5053
  }
@@ -4939,7 +5095,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
4939
5095
  } else if (qx_needs_dequant) {
4940
5096
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
4941
5097
  ggml_vk_sync_buffers(subctx);
4942
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5098
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
4943
5099
  }
4944
5100
  if (y_non_contig) {
4945
5101
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5073,12 +5229,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
5073
5229
 
5074
5230
  // Request descriptor sets
5075
5231
  if (qx_needs_dequant) {
5076
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5232
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5077
5233
  }
5078
5234
  if (qy_needs_dequant) {
5079
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5235
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5080
5236
  }
5081
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
5237
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
5082
5238
  return;
5083
5239
  }
5084
5240
 
@@ -5155,7 +5311,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
5155
5311
  ggml_vk_sync_buffers(subctx);
5156
5312
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
5157
5313
  { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
5158
- sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
5314
+ pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
5159
5315
  }
5160
5316
 
5161
5317
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5211,7 +5367,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
5211
5367
 
5212
5368
  if (dryrun) {
5213
5369
  // Request descriptor sets
5214
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
5370
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
5215
5371
  return;
5216
5372
  }
5217
5373
 
@@ -5243,7 +5399,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
5243
5399
  }
5244
5400
 
5245
5401
  ggml_vk_sync_buffers(subctx);
5246
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z });
5402
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
5247
5403
  }
5248
5404
 
5249
5405
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5300,7 +5456,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5300
5456
 
5301
5457
  if (dryrun) {
5302
5458
  // Request descriptor sets
5303
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
5459
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
5304
5460
  return;
5305
5461
  }
5306
5462
 
@@ -5326,7 +5482,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5326
5482
  const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
5327
5483
  ggml_vk_sync_buffers(subctx);
5328
5484
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
5329
- { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
5485
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
5330
5486
  }
5331
5487
 
5332
5488
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5487,12 +5643,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
5487
5643
  }
5488
5644
 
5489
5645
  // Request descriptor sets
5490
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
5646
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
5491
5647
  if (qx_needs_dequant) {
5492
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5648
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5493
5649
  }
5494
5650
  if (qy_needs_dequant) {
5495
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5651
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5496
5652
  }
5497
5653
  return;
5498
5654
  }
@@ -5542,7 +5698,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
5542
5698
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
5543
5699
  ggml_vk_sync_buffers(subctx);
5544
5700
  ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
5545
- { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5701
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5546
5702
  }
5547
5703
  if (y_non_contig) {
5548
5704
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5681,12 +5837,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
5681
5837
 
5682
5838
  // Request descriptor sets
5683
5839
  if (qx_needs_dequant) {
5684
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5840
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5685
5841
  }
5686
5842
  if (qy_needs_dequant) {
5687
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5843
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5688
5844
  }
5689
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
5845
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
5690
5846
  return;
5691
5847
  }
5692
5848
 
@@ -5762,7 +5918,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
5762
5918
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
5763
5919
  { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
5764
5920
  vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
5765
- sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
5921
+ pc, { groups_x, (uint32_t)nei0, groups_z });
5766
5922
  }
5767
5923
 
5768
5924
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
@@ -6006,9 +6162,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6006
6162
 
6007
6163
  if (dryrun) {
6008
6164
  // Request descriptor sets
6009
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
6165
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6010
6166
  if (split_k > 1) {
6011
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
6167
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
6012
6168
  }
6013
6169
  return;
6014
6170
  }
@@ -6112,7 +6268,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6112
6268
  // there's no more than one tile of rows (i.e. workgroups_x would have been
6113
6269
  // one). We reuse workgroups_x to mean the number of splits, so we need to
6114
6270
  // cancel out the divide by wg_denoms[0].
6115
- sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6271
+ pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6116
6272
 
6117
6273
  ggml_vk_sync_buffers(subctx);
6118
6274
  const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
@@ -6121,7 +6277,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6121
6277
  vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
6122
6278
  vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
6123
6279
  },
6124
- pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
6280
+ pc2, { (uint32_t)ne1, 1, 1 });
6125
6281
  } else {
6126
6282
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
6127
6283
  {
@@ -6131,7 +6287,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6131
6287
  vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
6132
6288
  vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
6133
6289
  },
6134
- sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
6290
+ pc, { workgroups_x, workgroups_y, workgroups_z });
6135
6291
  }
6136
6292
  }
6137
6293
 
@@ -6392,6 +6548,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
6392
6548
  return ctx->device->pipeline_timestep_embedding_f32;
6393
6549
  }
6394
6550
  return nullptr;
6551
+ case GGML_OP_CONV_TRANSPOSE_1D:
6552
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6553
+ return ctx->device->pipeline_conv_transpose_1d_f32;
6554
+ }
6555
+ return nullptr;
6395
6556
  case GGML_OP_POOL_2D:
6396
6557
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6397
6558
  return ctx->device->pipeline_pool2d_f32;
@@ -6566,7 +6727,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6566
6727
  }
6567
6728
 
6568
6729
  if (dryrun) {
6569
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
6730
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6570
6731
  return;
6571
6732
  }
6572
6733
 
@@ -6726,6 +6887,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6726
6887
  uint32_t half_ceil = (dim + 1) / 2;
6727
6888
  elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
6728
6889
  } break;
6890
+ case GGML_OP_CONV_TRANSPOSE_1D:
6891
+ {
6892
+ elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
6893
+ } break;
6729
6894
  case GGML_OP_POOL_2D:
6730
6895
  {
6731
6896
  const uint32_t N = dst->ne[3];
@@ -6800,7 +6965,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6800
6965
  }
6801
6966
 
6802
6967
  ggml_vk_sync_buffers(subctx);
6803
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6968
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6804
6969
  } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
6805
6970
  // Empty src2 is possible in rope, but the shader needs a buffer
6806
6971
  vk_subbuffer subbuf_z;
@@ -6811,26 +6976,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6811
6976
  }
6812
6977
 
6813
6978
  ggml_vk_sync_buffers(subctx);
6814
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6979
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6815
6980
  } else if (op == GGML_OP_IM2COL) {
6816
6981
  // im2col uses only src1 and dst buffers
6817
6982
  ggml_vk_sync_buffers(subctx);
6818
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6983
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6819
6984
  } else if (op == GGML_OP_COUNT_EQUAL) {
6820
6985
  ggml_vk_sync_buffers(subctx);
6821
6986
  // count_equal assumes that destination buffer is initialized with zeroes
6822
6987
  ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
6823
6988
  ggml_vk_sync_buffers(subctx);
6824
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6989
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6825
6990
  } else if (use_src2) {
6826
6991
  ggml_vk_sync_buffers(subctx);
6827
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6992
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6828
6993
  } else if (use_src1) {
6829
6994
  ggml_vk_sync_buffers(subctx);
6830
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6995
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6831
6996
  } else {
6832
6997
  ggml_vk_sync_buffers(subctx);
6833
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6998
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6834
6999
  }
6835
7000
  }
6836
7001
 
@@ -6943,7 +7108,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
6943
7108
  GGML_ASSERT(pipeline != nullptr);
6944
7109
 
6945
7110
  if (dryrun) {
6946
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
7111
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6947
7112
  return;
6948
7113
  }
6949
7114
 
@@ -6999,7 +7164,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
6999
7164
  vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
7000
7165
  vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
7001
7166
  vk_subbuffer{ d_D, dst_offset, dst_size }
7002
- }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
7167
+ }, pc, elements);
7003
7168
  } else if (version == 7) {
7004
7169
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
7005
7170
  vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
@@ -7010,7 +7175,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
7010
7175
  vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
7011
7176
  vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
7012
7177
  vk_subbuffer{ d_D, dst_offset, dst_size }
7013
- }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements);
7178
+ }, pc, elements);
7014
7179
  } else {
7015
7180
  // shouldn't happen
7016
7181
  GGML_ASSERT(false);
@@ -7082,7 +7247,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
7082
7247
  GGML_ASSERT(pipeline != nullptr);
7083
7248
 
7084
7249
  if (dryrun) {
7085
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
7250
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
7086
7251
  return;
7087
7252
  }
7088
7253
 
@@ -7147,7 +7312,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
7147
7312
  vk_subbuffer{ d_GM, gm_offset, gm_size },
7148
7313
  vk_subbuffer{ d_GV, gv_offset, gv_size },
7149
7314
  vk_subbuffer{ d_P, p_offset, p_size },
7150
- }, sizeof(vk_op_push_constants), &pc, elements);
7315
+ }, pc, elements);
7151
7316
  }
7152
7317
 
7153
7318
  static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
@@ -7529,6 +7694,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
7529
7694
  }, dryrun);
7530
7695
  }
7531
7696
 
7697
+ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
7698
+ // src0: (K, Cout, Cin, 1) -- kernel
7699
+ // src1: (L, Cin, 1, 1) -- input
7700
+ // dst: (*, Cout, 1, 1)
7701
+
7702
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7703
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7704
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7705
+
7706
+ GGML_TENSOR_BINARY_OP_LOCALS
7707
+
7708
+ GGML_ASSERT(nb00 == sizeof(float));
7709
+ GGML_ASSERT(nb10 == sizeof(float));
7710
+
7711
+ const int32_t s0 = dst->op_params[0];
7712
+
7713
+ vk_op_conv_transpose_1d_push_constants p{};
7714
+ p.Cout = static_cast<uint32_t>(ne01);
7715
+ p.Cin = static_cast<uint32_t>(ne02);
7716
+ p.K = static_cast<uint32_t>(ne00);
7717
+ p.L = static_cast<uint32_t>(ne10);
7718
+ p.KL = static_cast<uint32_t>(ne0);
7719
+ p.nb01 = static_cast<uint32_t>(nb01 / nb00);
7720
+ p.nb02 = static_cast<uint32_t>(nb02 / nb00);
7721
+ p.nb11 = static_cast<uint32_t>(nb11 / nb10);
7722
+ p.nb1 = static_cast<uint32_t>(nb1 / nb0);
7723
+ p.s0 = static_cast<uint32_t>(s0);
7724
+
7725
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
7726
+ }
7727
+
7532
7728
  static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
7533
7729
  uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
7534
7730
  const int32_t k1 = dst->op_params[1];
@@ -7729,9 +7925,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7729
7925
  }
7730
7926
  }
7731
7927
 
7732
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
7928
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
7733
7929
  if (split_k > 1) {
7734
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
7930
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
7735
7931
 
7736
7932
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
7737
7933
  // Resize buffer
@@ -7746,7 +7942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7746
7942
  ggml_vk_load_shaders(ctx->device);
7747
7943
  }
7748
7944
 
7749
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
7945
+ ggml_pipeline_allocate_descriptor_sets(ctx);
7750
7946
 
7751
7947
  vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
7752
7948
  vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -7788,7 +7984,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7788
7984
  ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
7789
7985
  ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
7790
7986
 
7791
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7987
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
7792
7988
  ggml_vk_ctx_begin(ctx->device, subctx);
7793
7989
  for (size_t i = 0; i < num_it; i++) {
7794
7990
  ggml_vk_matmul(
@@ -7804,6 +8000,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7804
8000
  ggml_vk_submit(subctx, ctx->fence);
7805
8001
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
7806
8002
  ctx->device->device.resetFences({ ctx->fence });
8003
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
7807
8004
 
7808
8005
  auto end = std::chrono::high_resolution_clock::now();
7809
8006
  double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
@@ -7905,16 +8102,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7905
8102
 
7906
8103
  free(d_chk);
7907
8104
 
7908
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
7909
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
8105
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
8106
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
7910
8107
 
7911
8108
  ggml_vk_destroy_buffer(d_X);
7912
8109
  ggml_vk_destroy_buffer(d_Y);
7913
8110
  ggml_vk_destroy_buffer(d_D);
7914
8111
 
7915
- ggml_pipeline_cleanup(p);
7916
- ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
7917
-
7918
8112
  free(x);
7919
8113
  free(y);
7920
8114
  free(d);
@@ -7992,20 +8186,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
7992
8186
  ggml_vk_quantize_data(x, qx, ne, quant);
7993
8187
  ggml_vk_dequantize_data(qx, x_ref, ne, quant);
7994
8188
 
7995
- ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
8189
+ ggml_pipeline_request_descriptor_sets(ctx, p, 1);
7996
8190
 
7997
8191
  if (ctx->device->need_compiles) {
7998
8192
  ggml_vk_load_shaders(ctx->device);
7999
8193
  }
8000
8194
 
8001
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
8195
+ ggml_pipeline_allocate_descriptor_sets(ctx);
8002
8196
 
8003
8197
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
8004
8198
 
8005
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8199
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8006
8200
  ggml_vk_ctx_begin(ctx->device, subctx);
8007
8201
  const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
8008
- ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
8202
+ ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
8009
8203
  ggml_vk_ctx_end(subctx);
8010
8204
 
8011
8205
  auto begin = std::chrono::high_resolution_clock::now();
@@ -8013,6 +8207,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8013
8207
  ggml_vk_submit(subctx, ctx->fence);
8014
8208
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
8015
8209
  ctx->device->device.resetFences({ ctx->fence });
8210
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
8016
8211
 
8017
8212
  auto end = std::chrono::high_resolution_clock::now();
8018
8213
 
@@ -8092,17 +8287,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8092
8287
  //
8093
8288
  // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
8094
8289
  //
8095
- // ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
8290
+ // ggml_pipeline_request_descriptor_sets(ctx, p, 1);
8096
8291
  //
8097
8292
  // if (ctx->device->need_compiles) {
8098
8293
  // ggml_vk_load_shaders(ctx->device);
8099
8294
  // }
8100
8295
  //
8101
- // ggml_pipeline_allocate_descriptor_sets(ctx->device);
8296
+ // ggml_pipeline_allocate_descriptor_sets(ctx);
8102
8297
  //
8103
8298
  // ggml_vk_buffer_write(x_buf, 0, x, x_sz);
8104
8299
  //
8105
- // vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8300
+ // vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8106
8301
  // ggml_vk_ctx_begin(ctx->device, subctx);
8107
8302
  // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
8108
8303
  // ggml_vk_ctx_end(subctx);
@@ -8112,6 +8307,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8112
8307
  // ggml_vk_submit(subctx, ctx->fence);
8113
8308
  // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
8114
8309
  // ctx->device->device.resetFences({ ctx->fence });
8310
+ // ggml_vk_queue_command_pools_cleanup(ctx->device);
8115
8311
  //
8116
8312
  // auto end = std::chrono::high_resolution_clock::now();
8117
8313
  //
@@ -8251,9 +8447,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8251
8447
  // y[i] = i % k;
8252
8448
  }
8253
8449
 
8254
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
8450
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
8255
8451
  if (split_k > 1) {
8256
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
8452
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
8257
8453
 
8258
8454
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
8259
8455
  // Resize buffer
@@ -8264,19 +8460,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8264
8460
  }
8265
8461
  }
8266
8462
  if (mmq) {
8267
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
8463
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
8268
8464
  }
8269
8465
 
8270
8466
  if (ctx->device->need_compiles) {
8271
8467
  ggml_vk_load_shaders(ctx->device);
8272
8468
  }
8273
8469
 
8274
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
8470
+ ggml_pipeline_allocate_descriptor_sets(ctx);
8275
8471
 
8276
8472
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
8277
8473
  ggml_vk_buffer_write(y_buf, 0, y, y_sz);
8278
8474
 
8279
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8475
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8280
8476
  ggml_vk_ctx_begin(ctx->device, subctx);
8281
8477
  if (mmq) {
8282
8478
  for (size_t i = 0; i < num_it; i++) {
@@ -8305,6 +8501,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8305
8501
  ggml_vk_submit(subctx, ctx->fence);
8306
8502
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
8307
8503
  ctx->device->device.resetFences({ ctx->fence });
8504
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
8308
8505
 
8309
8506
  auto end = std::chrono::high_resolution_clock::now();
8310
8507
 
@@ -8600,6 +8797,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8600
8797
  case GGML_OP_COUNT_EQUAL:
8601
8798
  case GGML_OP_IM2COL:
8602
8799
  case GGML_OP_TIMESTEP_EMBEDDING:
8800
+ case GGML_OP_CONV_TRANSPOSE_1D:
8603
8801
  case GGML_OP_POOL_2D:
8604
8802
  case GGML_OP_CONV_2D_DW:
8605
8803
  case GGML_OP_RWKV_WKV6:
@@ -8618,7 +8816,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8618
8816
 
8619
8817
  if (!dryrun) {
8620
8818
  if (ctx->compute_ctx.expired()) {
8621
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8819
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8622
8820
  ctx->compute_ctx = compute_ctx;
8623
8821
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
8624
8822
  } else {
@@ -8664,6 +8862,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8664
8862
  case GGML_OP_COUNT_EQUAL:
8665
8863
  case GGML_OP_IM2COL:
8666
8864
  case GGML_OP_TIMESTEP_EMBEDDING:
8865
+ case GGML_OP_CONV_TRANSPOSE_1D:
8667
8866
  case GGML_OP_POOL_2D:
8668
8867
  case GGML_OP_CONV_2D_DW:
8669
8868
  case GGML_OP_LEAKY_RELU:
@@ -8671,7 +8870,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8671
8870
  // These operations all go through ggml_vk_op_f32, so short-circuit and
8672
8871
  // do the only thing needed for the dryrun.
8673
8872
  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
8674
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
8873
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
8675
8874
  return false;
8676
8875
  }
8677
8876
  default:
@@ -8835,6 +9034,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8835
9034
  case GGML_OP_TIMESTEP_EMBEDDING:
8836
9035
  ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
8837
9036
 
9037
+ break;
9038
+ case GGML_OP_CONV_TRANSPOSE_1D:
9039
+ ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
9040
+
8838
9041
  break;
8839
9042
  case GGML_OP_POOL_2D:
8840
9043
  ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
@@ -8963,6 +9166,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
8963
9166
  case GGML_OP_COUNT_EQUAL:
8964
9167
  case GGML_OP_IM2COL:
8965
9168
  case GGML_OP_TIMESTEP_EMBEDDING:
9169
+ case GGML_OP_CONV_TRANSPOSE_1D:
8966
9170
  case GGML_OP_POOL_2D:
8967
9171
  case GGML_OP_CONV_2D_DW:
8968
9172
  case GGML_OP_RWKV_WKV6:
@@ -9058,19 +9262,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
9058
9262
  }
9059
9263
  ctx->gc.temp_buffers.clear();
9060
9264
 
9061
- for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
9062
- vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
9063
-
9064
- if (plr.expired()) {
9065
- continue;
9066
- }
9067
-
9068
- vk_pipeline pl = plr.lock();
9069
- ggml_pipeline_cleanup(pl);
9070
- }
9071
-
9072
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
9073
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
9265
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
9266
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
9074
9267
 
9075
9268
  for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
9076
9269
  ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
@@ -9091,7 +9284,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
9091
9284
 
9092
9285
  ctx->tensor_ctxs.clear();
9093
9286
  ctx->gc.contexts.clear();
9094
- ctx->device->pipeline_descriptor_set_requirements.clear();
9287
+ ctx->pipeline_descriptor_set_requirements = 0;
9288
+ ctx->descriptor_set_idx = 0;
9095
9289
  }
9096
9290
 
9097
9291
  // Clean up on backend free
@@ -9118,6 +9312,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
9118
9312
 
9119
9313
  ctx->device->device.destroyFence(ctx->fence);
9120
9314
  ctx->device->device.destroyFence(ctx->almost_ready_fence);
9315
+
9316
+ for (auto& pool : ctx->descriptor_pools) {
9317
+ ctx->device->device.destroyDescriptorPool(pool);
9318
+ }
9319
+ ctx->descriptor_pools.clear();
9320
+ ctx->descriptor_sets.clear();
9321
+
9322
+ ctx->compute_cmd_pool.destroy(ctx->device->device);
9323
+ ctx->transfer_cmd_pool.destroy(ctx->device->device);
9121
9324
  }
9122
9325
 
9123
9326
  static int ggml_vk_get_device_count() {
@@ -9325,6 +9528,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer
9325
9528
  UNUSED(buft);
9326
9529
  }
9327
9530
 
9531
+ static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { // max-size callback for the Vulkan host buffer type
9532
+ return vk_instance.devices[0]->suballocation_block_size; // always reads device 0's suballocation block size; buft is not consulted
9533
+
9534
+ UNUSED(buft); // placed after the return, so never executed; only marks the parameter as intentionally unused
9535
+ }
9536
+
9328
9537
  // Should be changed to return device-specific host buffer type
9329
9538
  // but that probably requires changes in llama.cpp
9330
9539
  ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
@@ -9333,7 +9542,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
9333
9542
  /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
9334
9543
  /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
9335
9544
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
9336
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
9545
+ /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size,
9337
9546
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9338
9547
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
9339
9548
  },
@@ -9384,7 +9593,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
9384
9593
 
9385
9594
  if (ctx->transfer_ctx.expired()) {
9386
9595
  // Initialize new transfer context
9387
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9596
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9388
9597
  ctx->transfer_ctx = transfer_ctx;
9389
9598
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9390
9599
  } else {
@@ -9407,7 +9616,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
9407
9616
 
9408
9617
  if (ctx->transfer_ctx.expired()) {
9409
9618
  // Initialize new transfer context
9410
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9619
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9411
9620
  ctx->transfer_ctx = transfer_ctx;
9412
9621
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9413
9622
  } else {
@@ -9430,7 +9639,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
9430
9639
 
9431
9640
  if (ctx->transfer_ctx.expired()) {
9432
9641
  // Initialize new transfer context
9433
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9642
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9434
9643
  ctx->transfer_ctx = transfer_ctx;
9435
9644
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9436
9645
  } else {
@@ -9480,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9480
9689
  VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
9481
9690
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
9482
9691
 
9692
+ if (vk_instance.debug_utils_support) {
9693
+ vk::DebugUtilsLabelEXT dul = {};
9694
+ dul.pLabelName = "ggml_backend_vk_graph_compute";
9695
+ dul.color = std::array<float,4>{1.0f, 1.0f, 1.0f, 1.0f};
9696
+ vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
9697
+ }
9698
+
9483
9699
  uint64_t total_mat_mul_bytes = 0;
9484
9700
  for (int i = 0; i < cgraph->n_nodes; i++) {
9485
9701
  ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
@@ -9491,7 +9707,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9491
9707
  ggml_vk_load_shaders(ctx->device);
9492
9708
  }
9493
9709
  ggml_vk_preallocate_buffers(ctx);
9494
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
9710
+ ggml_pipeline_allocate_descriptor_sets(ctx);
9495
9711
 
9496
9712
  int last_node = cgraph->n_nodes - 1;
9497
9713
 
@@ -9513,8 +9729,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9513
9729
  if (ctx->device->query_pool) {
9514
9730
  ctx->device->device.destroyQueryPool(ctx->device->query_pool);
9515
9731
  }
9516
- VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
9517
- query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
9732
+ vk::QueryPoolCreateInfo query_create_info;
9733
+ query_create_info.queryType = vk::QueryType::eTimestamp;
9518
9734
  query_create_info.queryCount = cgraph->n_nodes + 100;
9519
9735
  ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
9520
9736
  ctx->device->num_queries = query_create_info.queryCount;
@@ -9523,7 +9739,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9523
9739
  ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
9524
9740
 
9525
9741
  GGML_ASSERT(ctx->compute_ctx.expired());
9526
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9742
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
9527
9743
  ctx->compute_ctx = compute_ctx;
9528
9744
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
9529
9745
  compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
@@ -9558,7 +9774,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9558
9774
 
9559
9775
  if (vk_perf_logger_enabled) {
9560
9776
  if (ctx->compute_ctx.expired()) {
9561
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9777
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
9562
9778
  ctx->compute_ctx = compute_ctx;
9563
9779
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
9564
9780
  } else {
@@ -9600,7 +9816,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9600
9816
 
9601
9817
  // Get the results and pass them to the logger
9602
9818
  std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
9603
- ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
9819
+ VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
9604
9820
  for (int i = 0; i < cgraph->n_nodes; i++) {
9605
9821
  if (!ggml_vk_is_empty(cgraph->nodes[i])) {
9606
9822
  ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
@@ -10024,6 +10240,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
10024
10240
  case GGML_OP_LEAKY_RELU:
10025
10241
  case GGML_OP_OPT_STEP_ADAMW:
10026
10242
  return true;
10243
+ case GGML_OP_CONV_TRANSPOSE_1D:
10244
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
10027
10245
  default:
10028
10246
  return false;
10029
10247
  }
@@ -10167,11 +10385,28 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
10167
10385
  UNUSED(instance_extensions);
10168
10386
  }
10169
10387
 
10388
+ // Extension availability
10389
+ static bool ggml_vk_instance_debug_utils_ext_available(
10390
+ const std::vector<vk::ExtensionProperties> & instance_extensions) {
10391
+ // Check whether VK_EXT_debug_utils is listed among the given instance extensions
10392
+ for (const auto & properties : instance_extensions) {
10393
+ if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) {
10394
+ return true;
10395
+ }
10396
+ }
10397
+
10398
+ std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl;
10399
+ return false;
10400
+
10401
+ UNUSED(instance_extensions); // placed after the return, so never executed; the parameter is already used by the loop above
10402
+ }
10403
+
10170
10404
  static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
10171
10405
  switch (props.vendorID) {
10172
10406
  case VK_VENDOR_ID_INTEL:
10173
- // Intel drivers don't support coopmat properly yet
10174
- return false;
10407
+ // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
10408
+ // while some older hardware (ex. Arc A770) has performance regressions
10409
+ return arch == vk_device_architecture::INTEL_XE2;
10175
10410
  case VK_VENDOR_ID_AMD:
10176
10411
  if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
10177
10412
  // Workaround for AMD proprietary driver reporting support on all GPUs
@@ -10515,6 +10750,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
10515
10750
  const int32_t dim = tensor->op_params[0];
10516
10751
  const int32_t max_period = tensor->op_params[1];
10517
10752
  tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
10753
+ } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
10754
+ const int32_t s0 = tensor->op_params[0];
10755
+ const int32_t p0 = tensor->op_params[1];
10756
+ const int32_t d0 = tensor->op_params[2];
10757
+ tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
10518
10758
  } else if (tensor->op == GGML_OP_POOL_2D) {
10519
10759
  enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
10520
10760
  const int32_t k0 = tensor->op_params[1];