@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +140 -38
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +48 -67
  13. package/cpp/LlamaCppModel.h +8 -3
  14. package/cpp/PureCppImpl.cpp +1 -1
  15. package/cpp/PureCppImpl.h +2 -2
  16. package/cpp/build-info.cpp +2 -2
  17. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  18. package/cpp/llama.cpp/Makefile +2 -2
  19. package/cpp/llama.cpp/README.md +33 -13
  20. package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
  21. package/cpp/llama.cpp/common/arg.cpp +38 -12
  22. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  23. package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
  24. package/cpp/llama.cpp/common/chat-parser.h +4 -1
  25. package/cpp/llama.cpp/common/chat.cpp +16 -13
  26. package/cpp/llama.cpp/common/chat.h +1 -1
  27. package/cpp/llama.cpp/common/common.cpp +52 -40
  28. package/cpp/llama.cpp/common/common.h +5 -2
  29. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  30. package/cpp/llama.cpp/common/json-partial.h +2 -1
  31. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  32. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  33. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  34. package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  37. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  38. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
  39. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  41. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  79. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  82. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  112. package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
  113. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  114. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  115. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  116. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  117. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  118. package/cpp/llama.cpp/include/llama.h +140 -38
  119. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  120. package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
  121. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  122. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  123. package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
  124. package/cpp/llama.cpp/src/llama-batch.h +47 -17
  125. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  126. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  127. package/cpp/llama.cpp/src/llama-context.cpp +488 -313
  128. package/cpp/llama.cpp/src/llama-context.h +38 -17
  129. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  130. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  131. package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
  132. package/cpp/llama.cpp/src/llama-graph.h +109 -52
  133. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  134. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
  139. package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  141. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  142. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
  144. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  145. package/cpp/llama.cpp/src/llama-memory.h +89 -4
  146. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  147. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  148. package/cpp/llama.cpp/src/llama-model.cpp +735 -143
  149. package/cpp/llama.cpp/src/llama-model.h +4 -0
  150. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  151. package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
  152. package/cpp/llama.cpp/src/llama.cpp +11 -7
  153. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  154. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  155. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  156. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  157. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  158. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  159. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  160. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  161. package/cpp/rn-completion.cpp +65 -10
  162. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  163. package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
  164. package/ios/include/chat.h +1 -1
  165. package/ios/include/common/minja/chat-template.hpp +1 -1
  166. package/ios/include/common/minja/minja.hpp +1 -1
  167. package/ios/include/common.h +5 -2
  168. package/ios/include/json-schema-to-grammar.h +4 -4
  169. package/ios/include/llama.h +140 -38
  170. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  171. package/ios/libs/llama.xcframework/Info.plist +20 -20
  172. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
  174. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  175. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
  176. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  177. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  178. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  179. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
  180. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  181. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  182. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  184. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  185. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
  186. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  187. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
  188. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  189. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
  190. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  191. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  192. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
  193. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  194. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  195. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  196. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
  197. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  198. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
  199. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
  202. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
  203. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  204. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  205. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  206. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  207. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
  208. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  209. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
  210. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  211. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  212. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
  213. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
  214. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  215. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  216. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  217. package/package.json +1 -2
  218. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  219. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  221. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
  222. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
  223. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  224. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  225. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
78
78
  #define VK_VENDOR_ID_INTEL 0x8086
79
79
  #define VK_VENDOR_ID_NVIDIA 0x10de
80
80
 
81
- #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
81
+ #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
82
82
 
83
83
  #define GGML_VK_MAX_NODES 8192
84
84
 
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
102
102
 
103
103
  struct ggml_backend_vk_context;
104
104
 
105
- struct vk_queue {
106
- uint32_t queue_family_index;
107
- vk::Queue queue;
108
- vk::CommandPool pool;
109
- uint32_t cmd_buffer_idx;
110
- std::vector<vk::CommandBuffer> cmd_buffers;
111
-
112
- vk::PipelineStageFlags stage_flags;
113
-
114
- bool transfer_only;
115
- };
105
+ #define MAX_PARAMETER_COUNT 8
116
106
 
117
107
  struct vk_pipeline_struct {
118
108
  std::string name;
119
109
  vk::ShaderModule shader_module;
120
- vk::DescriptorSetLayout dsl;
121
- std::vector<vk::DescriptorPool> descriptor_pools;
122
- std::vector<vk::DescriptorSet> descriptor_sets;
123
- uint32_t descriptor_set_idx;
124
110
  vk::PipelineLayout layout;
125
111
  vk::Pipeline pipeline;
126
112
  uint32_t push_constant_size;
@@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context {
167
153
  vk_device device;
168
154
  };
169
155
 
156
+ struct vk_queue;
157
+
158
+ // Stores command pool/buffers. There's an instance of this
159
+ // for each (context,queue) pair and for each (device,queue) pair.
160
+ struct vk_command_pool {
161
+ void init(vk_device& device, vk_queue *q_);
162
+ void destroy(vk::Device& device);
163
+
164
+ vk::CommandPool pool;
165
+ uint32_t cmd_buffer_idx;
166
+ std::vector<vk::CommandBuffer> cmd_buffers;
167
+
168
+ vk_queue *q;
169
+ };
170
+
171
+ // Prevent simultaneous submissions to the same queue.
172
+ // This could be per vk_queue if we stopped having two vk_queue structures
173
+ // sharing the same vk::Queue.
174
+ static std::mutex queue_mutex;
175
+
176
+ struct vk_queue {
177
+ uint32_t queue_family_index;
178
+ vk::Queue queue;
179
+
180
+ vk_command_pool cmd_pool;
181
+
182
+ vk::PipelineStageFlags stage_flags;
183
+
184
+ bool transfer_only;
185
+
186
+ // copy everything except the cmd_pool
187
+ void copyFrom(vk_queue &other) {
188
+ queue_family_index = other.queue_family_index;
189
+ queue = other.queue;
190
+ stage_flags = other.stage_flags;
191
+ transfer_only = other.transfer_only;
192
+ }
193
+ };
194
+
170
195
  static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
171
196
  static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
172
197
  static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
@@ -196,6 +221,7 @@ enum vk_device_architecture {
196
221
  AMD_RDNA1,
197
222
  AMD_RDNA2,
198
223
  AMD_RDNA3,
224
+ INTEL_XE2,
199
225
  };
200
226
 
201
227
  static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
246
272
  }
247
273
  return vk_device_architecture::AMD_RDNA2;
248
274
  }
275
+ } else if (props.vendorID == VK_VENDOR_ID_INTEL) {
276
+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
277
+
278
+ bool subgroup_size_control = false;
279
+
280
+ for (const auto& properties : ext_props) {
281
+ if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
282
+ subgroup_size_control = true;
283
+ }
284
+ }
285
+
286
+ if (!subgroup_size_control) {
287
+ return vk_device_architecture::OTHER;
288
+ }
289
+
290
+ vk::PhysicalDeviceProperties2 props2;
291
+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
292
+
293
+ props2.pNext = &subgroup_size_control_props;
294
+ device.getProperties2(&props2);
295
+
296
+ if (subgroup_size_control_props.minSubgroupSize == 16) {
297
+ // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
298
+ // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
299
+ // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
300
+ // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
301
+ return vk_device_architecture::INTEL_XE2;
302
+ }
249
303
  }
250
304
  return vk_device_architecture::OTHER;
251
305
  }
@@ -312,6 +366,8 @@ struct vk_device_struct {
312
366
  // set to true to indicate that some shaders need to be compiled after the dryrun
313
367
  bool need_compiles {};
314
368
 
369
+ vk::DescriptorSetLayout dsl;
370
+
315
371
  vk_matmul_pipeline pipeline_matmul_f32 {};
316
372
  vk_matmul_pipeline pipeline_matmul_f32_f16 {};
317
373
  vk_matmul_pipeline pipeline_matmul_bf16 {};
@@ -396,6 +452,7 @@ struct vk_device_struct {
396
452
  vk_pipeline pipeline_count_equal_i32;
397
453
  vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
398
454
  vk_pipeline pipeline_timestep_embedding_f32;
455
+ vk_pipeline pipeline_conv_transpose_1d_f32;
399
456
  vk_pipeline pipeline_pool2d_f32;
400
457
  vk_pipeline pipeline_rwkv_wkv6_f32;
401
458
  vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -428,7 +485,6 @@ struct vk_device_struct {
428
485
  vk_pipeline pipeline_flash_attn_split_k_reduce;
429
486
 
430
487
  std::unordered_map<std::string, vk_pipeline_ref> pipelines;
431
- std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
432
488
 
433
489
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
434
490
 
@@ -444,7 +500,7 @@ struct vk_device_struct {
444
500
  // for GGML_VK_PERF_LOGGER
445
501
  std::unique_ptr<vk_perf_logger> perf_logger;
446
502
  vk::QueryPool query_pool;
447
- uint32_t num_queries;
503
+ int32_t num_queries;
448
504
 
449
505
  ~vk_device_struct() {
450
506
  VK_LOG_DEBUG("destroy device " << name);
@@ -453,10 +509,8 @@ struct vk_device_struct {
453
509
 
454
510
  ggml_vk_destroy_buffer(sync_staging);
455
511
 
456
- device.destroyCommandPool(compute_queue.pool);
457
- if (!single_queue) {
458
- device.destroyCommandPool(transfer_queue.pool);
459
- }
512
+ compute_queue.cmd_pool.destroy(device);
513
+ transfer_queue.cmd_pool.destroy(device);
460
514
 
461
515
  for (auto& pipeline : pipelines) {
462
516
  if (pipeline.second.expired()) {
@@ -468,10 +522,26 @@ struct vk_device_struct {
468
522
  }
469
523
  pipelines.clear();
470
524
 
525
+ device.destroyDescriptorSetLayout(dsl);
526
+
471
527
  device.destroy();
472
528
  }
473
529
  };
474
530
 
531
+ void vk_command_pool::init(vk_device& device, vk_queue *q_) {
532
+ cmd_buffer_idx = 0;
533
+ q = q_;
534
+
535
+ vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
536
+ pool = device->device.createCommandPool(command_pool_create_info);
537
+ }
538
+
539
+ void vk_command_pool::destroy(vk::Device& device) {
540
+ device.destroyCommandPool(pool);
541
+ pool = nullptr;
542
+ cmd_buffers.clear();
543
+ }
544
+
475
545
  struct vk_buffer_struct {
476
546
  vk::Buffer buffer = VK_NULL_HANDLE;
477
547
  vk::DeviceMemory device_memory = VK_NULL_HANDLE;
@@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants {
706
776
  uint32_t max_period;
707
777
  };
708
778
 
779
+ struct vk_op_conv_transpose_1d_push_constants {
780
+ uint32_t Cout;
781
+ uint32_t Cin;
782
+ uint32_t K;
783
+ uint32_t L;
784
+ uint32_t KL;
785
+
786
+ uint32_t nb01;
787
+ uint32_t nb02;
788
+ uint32_t nb11;
789
+ uint32_t nb1;
790
+
791
+ int32_t s0;
792
+ };
793
+
709
794
  struct vk_op_pool2d_push_constants {
710
795
  uint32_t IW; uint32_t IH;
711
796
  uint32_t OW; uint32_t OH;
@@ -774,7 +859,7 @@ struct vk_context_struct {
774
859
  std::vector<vk_staging_memcpy> in_memcpys;
775
860
  std::vector<vk_staging_memcpy> out_memcpys;
776
861
 
777
- vk_queue * q;
862
+ vk_command_pool * p {};
778
863
  };
779
864
  typedef std::shared_ptr<vk_context_struct> vk_context;
780
865
  typedef std::weak_ptr<vk_context_struct> vk_context_ref;
@@ -885,6 +970,14 @@ struct ggml_backend_vk_context {
885
970
  vk_context_ref transfer_ctx;
886
971
 
887
972
  std::vector<vk_context_ref> tensor_ctxs;
973
+
974
+ std::vector<vk::DescriptorPool> descriptor_pools;
975
+ std::vector<vk::DescriptorSet> descriptor_sets;
976
+ uint32_t descriptor_set_idx {};
977
+ uint32_t pipeline_descriptor_set_requirements {};
978
+
979
+ vk_command_pool compute_cmd_pool;
980
+ vk_command_pool transfer_cmd_pool;
888
981
  };
889
982
 
890
983
  static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1015,39 +1108,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
1015
1108
  ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
1016
1109
  disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
1017
1110
  GGML_ASSERT(parameter_count > 0);
1111
+ GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
1018
1112
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
1019
1113
 
1020
1114
  vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
1021
1115
  pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
1022
1116
 
1023
- std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
1024
- std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
1025
- for (uint32_t i = 0; i < parameter_count; i++) {
1026
- dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
1027
- dsl_binding_flags.push_back({});
1028
- }
1029
-
1030
- vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
1031
-
1032
1117
  vk::PushConstantRange pcr(
1033
1118
  vk::ShaderStageFlagBits::eCompute,
1034
1119
  0,
1035
1120
  pipeline->push_constant_size
1036
1121
  );
1037
1122
 
1038
- vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
1039
- {},
1040
- dsl_binding);
1041
- descriptor_set_layout_create_info.setPNext(&dslbfci);
1042
- pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
1043
-
1044
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1045
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1046
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1047
-
1048
- pipeline->descriptor_set_idx = 0;
1049
-
1050
- vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
1123
+ vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
1051
1124
  pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
1052
1125
 
1053
1126
  std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
@@ -1122,15 +1195,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
1122
1195
 
1123
1196
  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
1124
1197
  VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
1125
- for (auto& pool : pipeline->descriptor_pools) {
1126
- device.destroyDescriptorPool(pool);
1127
- }
1128
- pipeline->descriptor_pools.clear();
1129
- pipeline->descriptor_sets.clear();
1130
- pipeline->descriptor_set_idx = 0;
1131
-
1132
- device.destroyDescriptorSetLayout(pipeline->dsl);
1133
-
1134
1198
  device.destroyPipelineLayout(pipeline->layout);
1135
1199
 
1136
1200
  device.destroyShaderModule(pipeline->shader_module);
@@ -1138,97 +1202,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
1138
1202
  device.destroyPipeline(pipeline->pipeline);
1139
1203
  }
1140
1204
 
1141
- static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
1205
+ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
1142
1206
  VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
1143
- device->pipeline_descriptor_set_requirements[pipeline->name] += n;
1207
+ ctx->pipeline_descriptor_set_requirements += n;
1144
1208
  if (!pipeline->compiled) {
1145
1209
  pipeline->needed = true;
1146
- device->need_compiles = true;
1210
+ ctx->device->need_compiles = true;
1147
1211
  }
1148
1212
  }
1149
1213
 
1150
- static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
1151
- std::lock_guard<std::mutex> guard(device->mutex);
1152
-
1153
- for (auto& pair : device->pipeline_descriptor_set_requirements) {
1154
- vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
1155
- const uint64_t n = pair.second;
1156
-
1157
- VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
1214
+ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
1158
1215
 
1159
- if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
1160
- // Enough descriptors are available
1161
- continue;
1162
- }
1216
+ if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
1217
+ // Enough descriptors are available
1218
+ return;
1219
+ }
1163
1220
 
1164
- uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
1165
- uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1166
- uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1221
+ vk_device& device = ctx->device;
1167
1222
 
1168
- while (to_alloc > 0) {
1169
- const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
1170
- to_alloc -= alloc_count;
1171
- pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1223
+ uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
1224
+ uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1225
+ uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1172
1226
 
1173
- if (pool_idx >= pipeline->descriptor_pools.size()) {
1174
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1175
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1176
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1177
- }
1227
+ while (to_alloc > 0) {
1228
+ const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
1229
+ to_alloc -= alloc_count;
1230
+ pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
1178
1231
 
1179
- std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
1180
- for (uint32_t i = 0; i < alloc_count; i++) {
1181
- layouts[i] = pipeline->dsl;
1182
- }
1183
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
1184
- std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
1185
- pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
1232
+ if (pool_idx >= ctx->descriptor_pools.size()) {
1233
+ vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
1234
+ vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
1235
+ ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
1236
+ }
1186
1237
 
1187
- pool_idx++;
1238
+ std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
1239
+ for (uint32_t i = 0; i < alloc_count; i++) {
1240
+ layouts[i] = device->dsl;
1188
1241
  }
1189
- }
1190
- }
1242
+ vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
1243
+ std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
1244
+ ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
1191
1245
 
1192
- static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
1193
- VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
1194
- pipeline->descriptor_set_idx = 0;
1246
+ pool_idx++;
1247
+ }
1195
1248
  }
1196
1249
 
1197
- static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
1250
+ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
1198
1251
  VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
1199
- std::lock_guard<std::mutex> guard(device->mutex);
1200
1252
 
1201
- if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
1253
+ if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
1202
1254
  // Reuse command buffer
1203
- return q.cmd_buffers[q.cmd_buffer_idx++];
1255
+ return p.cmd_buffers[p.cmd_buffer_idx++];
1204
1256
  }
1205
1257
 
1206
1258
  vk::CommandBufferAllocateInfo command_buffer_alloc_info(
1207
- q.pool,
1259
+ p.pool,
1208
1260
  vk::CommandBufferLevel::ePrimary,
1209
1261
  1);
1210
1262
  const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
1211
1263
  auto buf = cmd_buffers.front();
1212
1264
 
1213
- q.cmd_buffers.push_back(buf);
1214
- q.cmd_buffer_idx++;
1265
+ p.cmd_buffers.push_back(buf);
1266
+ p.cmd_buffer_idx++;
1215
1267
 
1216
1268
  return buf;
1217
1269
  }
1218
1270
 
1219
- static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
1220
- VK_LOG_DEBUG("ggml_vk_create_submission()");
1221
- vk_submission s;
1222
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
1223
- s.wait_semaphores = std::move(wait_semaphores);
1224
- s.signal_semaphores = std::move(signal_semaphores);
1225
- return s;
1226
- }
1227
-
1228
1271
  static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1229
1272
  if (ctx->seqs.empty()) {
1230
1273
  if (fence) {
1231
- ctx->q->queue.submit({}, fence);
1274
+ std::lock_guard<std::mutex> guard(queue_mutex);
1275
+ ctx->p->q->queue.submit({}, fence);
1232
1276
  }
1233
1277
  return;
1234
1278
  }
@@ -1267,7 +1311,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1267
1311
  tl_signal_vals.push_back({});
1268
1312
  tl_signal_semaphores.push_back({});
1269
1313
  for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
1270
- stage_flags[idx].push_back(ctx->q->stage_flags);
1314
+ stage_flags[idx].push_back(ctx->p->q->stage_flags);
1271
1315
  tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
1272
1316
  tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
1273
1317
  }
@@ -1297,7 +1341,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
1297
1341
  }
1298
1342
  }
1299
1343
 
1300
- ctx->q->queue.submit(submit_infos, fence);
1344
+ std::lock_guard<std::mutex> guard(queue_mutex);
1345
+ ctx->p->q->queue.submit(submit_infos, fence);
1301
1346
 
1302
1347
  ctx->seqs.clear();
1303
1348
  }
@@ -1355,28 +1400,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
1355
1400
  q.queue_family_index = queue_family_index;
1356
1401
  q.transfer_only = transfer_only;
1357
1402
 
1358
- vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
1359
- q.pool = device->device.createCommandPool(command_pool_create_info_compute);
1360
-
1361
- q.cmd_buffer_idx = 0;
1403
+ q.cmd_pool.init(device, &q);
1362
1404
 
1363
1405
  q.queue = device->device.getQueue(queue_family_index, queue_index);
1364
1406
 
1365
1407
  q.stage_flags = stage_flags;
1366
1408
  }
1367
1409
 
1368
- static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
1410
+ static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
1369
1411
  vk_context result = std::make_shared<vk_context_struct>();
1370
1412
  VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
1371
1413
  ctx->gc.contexts.emplace_back(result);
1372
- result->q = &q;
1414
+ result->p = &p;
1373
1415
  return result;
1374
1416
  }
1375
1417
 
1376
- static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
1418
+ static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
1377
1419
  vk_context result = std::make_shared<vk_context_struct>();
1378
1420
  VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
1379
- result->q = &q;
1421
+ result->p = &p;
1380
1422
  return result;
1381
1423
  }
1382
1424
 
@@ -1409,15 +1451,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
1409
1451
  return ctx->gc.events[ctx->event_idx++];
1410
1452
  }
1411
1453
 
1412
- static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
1413
- VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
1414
- std::lock_guard<std::mutex> guard(device->mutex);
1454
+ static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
1455
+ VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
1415
1456
 
1416
1457
  // Requires command buffers to be done
1417
- device->device.resetCommandPool(q.pool);
1418
- q.cmd_buffer_idx = 0;
1458
+ device->device.resetCommandPool(p.pool);
1459
+ p.cmd_buffer_idx = 0;
1460
+ }
1461
+
1462
+ static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
1463
+ VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
1464
+
1465
+ // Arbitrary frequency to cleanup/reuse command buffers
1466
+ static constexpr uint32_t cleanup_frequency = 10;
1467
+
1468
+ if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
1469
+ ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
1470
+ }
1471
+ if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
1472
+ ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
1473
+ }
1419
1474
  }
1420
1475
 
1476
+
1421
1477
  static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
1422
1478
  for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
1423
1479
  vk::MemoryType memory_type = mem_props->memoryTypes[i];
@@ -1436,8 +1492,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
1436
1492
  throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
1437
1493
  }
1438
1494
 
1439
- std::lock_guard<std::mutex> guard(device->mutex);
1440
-
1441
1495
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
1442
1496
 
1443
1497
  if (size == 0) {
@@ -1566,11 +1620,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
1566
1620
  static void ggml_vk_sync_buffers(vk_context& ctx) {
1567
1621
  VK_LOG_DEBUG("ggml_vk_sync_buffers()");
1568
1622
 
1569
- const bool transfer_queue = ctx->q->transfer_only;
1623
+ const bool transfer_queue = ctx->p->q->transfer_only;
1570
1624
 
1571
1625
  ctx->s->buffer.pipelineBarrier(
1572
- ctx->q->stage_flags,
1573
- ctx->q->stage_flags,
1626
+ ctx->p->q->stage_flags,
1627
+ ctx->p->q->stage_flags,
1574
1628
  {},
1575
1629
  { {
1576
1630
  { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
@@ -1589,8 +1643,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
1589
1643
 
1590
1644
  ctx->s->buffer.waitEvents(
1591
1645
  events,
1592
- ctx->q->stage_flags,
1593
- ctx->q->stage_flags,
1646
+ ctx->p->q->stage_flags,
1647
+ ctx->p->q->stage_flags,
1594
1648
  {},
1595
1649
  {},
1596
1650
  {}
@@ -1652,7 +1706,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
1652
1706
  return {64, 32};
1653
1707
  }
1654
1708
  return {64, 64};
1655
- };
1709
+ }
1656
1710
 
1657
1711
  static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
1658
1712
 
@@ -2726,6 +2780,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
2726
2780
 
2727
2781
  ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
2728
2782
 
2783
+ ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
2784
+
2729
2785
  ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
2730
2786
 
2731
2787
  ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -3322,6 +3378,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
3322
3378
  }
3323
3379
  }
3324
3380
 
3381
+
3382
+ std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
3383
+ std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
3384
+ for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
3385
+ dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
3386
+ dsl_binding_flags.push_back({});
3387
+ }
3388
+
3389
+ vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
3390
+
3391
+ vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
3392
+ {},
3393
+ dsl_binding);
3394
+ descriptor_set_layout_create_info.setPNext(&dslbfci);
3395
+ device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
3396
+
3325
3397
  ggml_vk_load_shaders(device);
3326
3398
 
3327
3399
  if (!device->single_queue) {
@@ -3329,7 +3401,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
3329
3401
  ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
3330
3402
  } else {
3331
3403
  // TODO: Use pointer or reference to avoid copy
3332
- device->transfer_queue = device->compute_queue;
3404
+ device->transfer_queue.copyFrom(device->compute_queue);
3405
+ device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
3333
3406
  }
3334
3407
 
3335
3408
  device->buffer_type = {
@@ -3548,11 +3621,11 @@ static void ggml_vk_instance_init() {
3548
3621
 
3549
3622
  vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
3550
3623
 
3551
- size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
3552
-
3553
3624
  // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
3554
3625
  char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
3555
3626
  if (devices_env != nullptr) {
3627
+ size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
3628
+
3556
3629
  std::string devices(devices_env);
3557
3630
  std::replace(devices.begin(), devices.end(), ',', ' ');
3558
3631
 
@@ -3568,9 +3641,9 @@ static void ggml_vk_instance_init() {
3568
3641
  } else {
3569
3642
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
3570
3643
 
3571
- // Make sure at least one device exists
3644
+ // If no vulkan devices are found, return early
3572
3645
  if (devices.empty()) {
3573
- std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
3646
+ GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
3574
3647
  return;
3575
3648
  }
3576
3649
 
@@ -3653,9 +3726,20 @@ static void ggml_vk_instance_init() {
3653
3726
  }
3654
3727
  }
3655
3728
 
3656
- // If no dedicated GPUs found, fall back to GPU 0
3729
+ // If no dedicated GPUs found, fall back to the first non-CPU device.
3730
+ // If only CPU devices are available, return without devices.
3657
3731
  if (vk_instance.device_indices.empty()) {
3658
- vk_instance.device_indices.push_back(0);
3732
+ for (size_t i = 0; i < devices.size(); i++) {
3733
+ if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) {
3734
+ vk_instance.device_indices.push_back(i);
3735
+ break;
3736
+ }
3737
+ }
3738
+ }
3739
+
3740
+ if (vk_instance.device_indices.empty()) {
3741
+ GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
3742
+ return;
3659
3743
  }
3660
3744
  }
3661
3745
  GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
@@ -3684,6 +3768,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
3684
3768
  ctx->fence = ctx->device->device.createFence({});
3685
3769
  ctx->almost_ready_fence = ctx->device->device.createFence({});
3686
3770
 
3771
+ ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
3772
+ ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
3773
+
3687
3774
  #ifdef GGML_VULKAN_CHECK_RESULTS
3688
3775
  const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
3689
3776
  vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -4049,9 +4136,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
4049
4136
  }
4050
4137
  }
4051
4138
 
4052
- static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
4139
+ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
4053
4140
  vk_submission s;
4054
- s.buffer = ggml_vk_create_cmd_buffer(device, q);
4141
+ s.buffer = ggml_vk_create_cmd_buffer(device, p);
4055
4142
  if (one_time) {
4056
4143
  s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
4057
4144
  } else {
@@ -4061,7 +4148,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
4061
4148
  return s;
4062
4149
  }
4063
4150
 
4064
- static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
4151
+ template <typename T> size_t push_constant_size(const T &t) {
4152
+ static_assert(std::is_class<T>::value, "T must be a struct/class");
4153
+ GGML_UNUSED(t);
4154
+ return sizeof(T);
4155
+ }
4156
+ template <typename T> size_t push_constant_size(const std::vector<T> &t) {
4157
+ GGML_UNUSED(t);
4158
+ return sizeof(T) * t.size();
4159
+ }
4160
+ template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
4161
+ GGML_UNUSED(t);
4162
+ return sizeof(T) * N;
4163
+ }
4164
+
4165
+ template <typename T> const T *push_constant_data(const T &t) {
4166
+ static_assert(std::is_class<T>::value, "T must be a struct/class");
4167
+ return &t;
4168
+ }
4169
+ template <typename T> const T *push_constant_data(const std::vector<T> &t) {
4170
+ return t.data();
4171
+ }
4172
+ template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
4173
+ return t.data();
4174
+ }
4175
+
4176
+ template <typename T>
4177
+ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
4065
4178
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
4066
4179
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
4067
4180
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
@@ -4070,14 +4183,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
4070
4183
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
4071
4184
  }
4072
4185
  std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
4073
- GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
4074
- GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
4186
+ GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
4187
+ GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
4075
4188
 
4076
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
4189
+ vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
4077
4190
  vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
4078
4191
  ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
4079
4192
 
4080
- subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
4193
+ subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
4081
4194
  subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
4082
4195
  subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
4083
4196
  pipeline->layout,
@@ -4110,7 +4223,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
4110
4223
  ggml_vk_ctx_end(subctx);
4111
4224
  }
4112
4225
 
4113
- subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
4226
+ subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
4114
4227
  subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
4115
4228
  }
4116
4229
 
@@ -4311,7 +4424,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
4311
4424
  memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
4312
4425
  }
4313
4426
  } else {
4314
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
4427
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
4428
+
4429
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
4315
4430
  ggml_vk_ctx_begin(dst->device, subctx);
4316
4431
  ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
4317
4432
  ggml_vk_ctx_end(subctx);
@@ -4323,6 +4438,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
4323
4438
  ggml_vk_submit(subctx, dst->device->fence);
4324
4439
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
4325
4440
  dst->device->device.resetFences({ dst->device->fence });
4441
+ ggml_vk_queue_command_pools_cleanup(dst->device);
4326
4442
  }
4327
4443
  }
4328
4444
 
@@ -4399,7 +4515,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
4399
4515
 
4400
4516
  memcpy(dst, (uint8_t *) src->ptr + offset, size);
4401
4517
  } else {
4402
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
4518
+ std::lock_guard<std::mutex> guard(src->device->mutex);
4519
+
4520
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
4403
4521
  ggml_vk_ctx_begin(src->device, subctx);
4404
4522
  ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
4405
4523
  ggml_vk_ctx_end(subctx);
@@ -4407,6 +4525,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
4407
4525
  ggml_vk_submit(subctx, src->device->fence);
4408
4526
  VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
4409
4527
  src->device->device.resetFences({ src->device->fence });
4528
+ ggml_vk_queue_command_pools_cleanup(src->device);
4410
4529
 
4411
4530
  for (auto& cpy : subctx->out_memcpys) {
4412
4531
  memcpy(cpy.dst, cpy.src, cpy.n);
@@ -4426,15 +4545,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
4426
4545
 
4427
4546
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
4428
4547
  if (src->device == dst->device) {
4548
+ std::lock_guard<std::mutex> guard(src->device->mutex);
4429
4549
  VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
4430
4550
  // Copy within the device
4431
- vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
4551
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
4432
4552
  ggml_vk_ctx_begin(src->device, subctx);
4433
4553
  ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
4434
4554
  ggml_vk_ctx_end(subctx);
4435
4555
  ggml_vk_submit(subctx, src->device->fence);
4436
4556
  VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
4437
4557
  src->device->device.resetFences({ src->device->fence });
4558
+ ggml_vk_queue_command_pools_cleanup(src->device);
4438
4559
  } else {
4439
4560
  VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
4440
4561
  // Copy device to device
@@ -4459,7 +4580,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
4459
4580
  static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
4460
4581
  VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
4461
4582
 
4462
- vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
4583
+ std::lock_guard<std::mutex> guard(dst->device->mutex);
4584
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
4463
4585
  ggml_vk_ctx_begin(dst->device, subctx);
4464
4586
  subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
4465
4587
  ggml_vk_ctx_end(subctx);
@@ -4467,6 +4589,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
4467
4589
  ggml_vk_submit(subctx, dst->device->fence);
4468
4590
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
4469
4591
  dst->device->device.resetFences({ dst->device->fence });
4592
+ ggml_vk_queue_command_pools_cleanup(dst->device);
4470
4593
  }
4471
4594
 
4472
4595
  static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
@@ -4540,7 +4663,7 @@ static void ggml_vk_matmul(
4540
4663
  ggml_vk_sync_buffers(subctx);
4541
4664
  if (split_k == 1) {
4542
4665
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
4543
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
4666
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
4544
4667
  return;
4545
4668
  }
4546
4669
 
@@ -4548,10 +4671,10 @@ static void ggml_vk_matmul(
4548
4671
 
4549
4672
  const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
4550
4673
  // Make sure enough workgroups get assigned for split k to work
4551
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
4674
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
4552
4675
  ggml_vk_sync_buffers(subctx);
4553
4676
  const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
4554
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
4677
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
4555
4678
  }
4556
4679
 
4557
4680
  static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
@@ -4599,7 +4722,7 @@ static void ggml_vk_matmul_id(
4599
4722
  ggml_vk_sync_buffers(subctx);
4600
4723
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
4601
4724
  nei0, nei1, nbi1, ne11, padded_n };
4602
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
4725
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
4603
4726
  }
4604
4727
 
4605
4728
  static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
@@ -4720,7 +4843,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
4720
4843
  };
4721
4844
  init_pushconst_fastdiv(pc);
4722
4845
  ggml_vk_sync_buffers(subctx);
4723
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
4846
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
4724
4847
  }
4725
4848
 
4726
4849
  static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -4739,7 +4862,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
4739
4862
  vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
4740
4863
 
4741
4864
  ggml_vk_sync_buffers(subctx);
4742
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 });
4865
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
4743
4866
  }
4744
4867
 
4745
4868
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -4880,18 +5003,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
4880
5003
  }
4881
5004
 
4882
5005
  // Request descriptor sets
4883
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
5006
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
4884
5007
  if (qx_needs_dequant) {
4885
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5008
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
4886
5009
  }
4887
5010
  if (qy_needs_dequant) {
4888
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5011
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
4889
5012
  }
4890
5013
  if (quantize_y) {
4891
- ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
5014
+ ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
4892
5015
  }
4893
5016
  if (split_k > 1) {
4894
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
5017
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
4895
5018
  }
4896
5019
  return;
4897
5020
  }
@@ -4939,7 +5062,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
4939
5062
  } else if (qx_needs_dequant) {
4940
5063
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
4941
5064
  ggml_vk_sync_buffers(subctx);
4942
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5065
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
4943
5066
  }
4944
5067
  if (y_non_contig) {
4945
5068
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5073,12 +5196,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
5073
5196
 
5074
5197
  // Request descriptor sets
5075
5198
  if (qx_needs_dequant) {
5076
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5199
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5077
5200
  }
5078
5201
  if (qy_needs_dequant) {
5079
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5202
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5080
5203
  }
5081
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
5204
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
5082
5205
  return;
5083
5206
  }
5084
5207
 
@@ -5155,7 +5278,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
5155
5278
  ggml_vk_sync_buffers(subctx);
5156
5279
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
5157
5280
  { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
5158
- sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
5281
+ pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
5159
5282
  }
5160
5283
 
5161
5284
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5211,7 +5334,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
5211
5334
 
5212
5335
  if (dryrun) {
5213
5336
  // Request descriptor sets
5214
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
5337
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
5215
5338
  return;
5216
5339
  }
5217
5340
 
@@ -5243,7 +5366,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
5243
5366
  }
5244
5367
 
5245
5368
  ggml_vk_sync_buffers(subctx);
5246
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z });
5369
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
5247
5370
  }
5248
5371
 
5249
5372
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5300,7 +5423,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5300
5423
 
5301
5424
  if (dryrun) {
5302
5425
  // Request descriptor sets
5303
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
5426
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
5304
5427
  return;
5305
5428
  }
5306
5429
 
@@ -5326,7 +5449,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5326
5449
  const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
5327
5450
  ggml_vk_sync_buffers(subctx);
5328
5451
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
5329
- { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
5452
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
5330
5453
  }
5331
5454
 
5332
5455
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -5487,12 +5610,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
5487
5610
  }
5488
5611
 
5489
5612
  // Request descriptor sets
5490
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
5613
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
5491
5614
  if (qx_needs_dequant) {
5492
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5615
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5493
5616
  }
5494
5617
  if (qy_needs_dequant) {
5495
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5618
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5496
5619
  }
5497
5620
  return;
5498
5621
  }
@@ -5542,7 +5665,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
5542
5665
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
5543
5666
  ggml_vk_sync_buffers(subctx);
5544
5667
  ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
5545
- { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5668
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
5546
5669
  }
5547
5670
  if (y_non_contig) {
5548
5671
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -5681,12 +5804,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
5681
5804
 
5682
5805
  // Request descriptor sets
5683
5806
  if (qx_needs_dequant) {
5684
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
5807
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
5685
5808
  }
5686
5809
  if (qy_needs_dequant) {
5687
- ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
5810
+ ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
5688
5811
  }
5689
- ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
5812
+ ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
5690
5813
  return;
5691
5814
  }
5692
5815
 
@@ -5762,7 +5885,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
5762
5885
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
5763
5886
  { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
5764
5887
  vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
5765
- sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
5888
+ pc, { groups_x, (uint32_t)nei0, groups_z });
5766
5889
  }
5767
5890
 
5768
5891
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
@@ -6006,9 +6129,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6006
6129
 
6007
6130
  if (dryrun) {
6008
6131
  // Request descriptor sets
6009
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
6132
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6010
6133
  if (split_k > 1) {
6011
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
6134
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
6012
6135
  }
6013
6136
  return;
6014
6137
  }
@@ -6112,7 +6235,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6112
6235
  // there's no more than one tile of rows (i.e. workgroups_x would have been
6113
6236
  // one). We reuse workgroups_x to mean the number of splits, so we need to
6114
6237
  // cancel out the divide by wg_denoms[0].
6115
- sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6238
+ pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6116
6239
 
6117
6240
  ggml_vk_sync_buffers(subctx);
6118
6241
  const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
@@ -6121,7 +6244,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6121
6244
  vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
6122
6245
  vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
6123
6246
  },
6124
- pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
6247
+ pc2, { (uint32_t)ne1, 1, 1 });
6125
6248
  } else {
6126
6249
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
6127
6250
  {
@@ -6131,7 +6254,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6131
6254
  vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
6132
6255
  vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
6133
6256
  },
6134
- sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
6257
+ pc, { workgroups_x, workgroups_y, workgroups_z });
6135
6258
  }
6136
6259
  }
6137
6260
 
@@ -6392,6 +6515,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
6392
6515
  return ctx->device->pipeline_timestep_embedding_f32;
6393
6516
  }
6394
6517
  return nullptr;
6518
+ case GGML_OP_CONV_TRANSPOSE_1D:
6519
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6520
+ return ctx->device->pipeline_conv_transpose_1d_f32;
6521
+ }
6522
+ return nullptr;
6395
6523
  case GGML_OP_POOL_2D:
6396
6524
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6397
6525
  return ctx->device->pipeline_pool2d_f32;
@@ -6566,7 +6694,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6566
6694
  }
6567
6695
 
6568
6696
  if (dryrun) {
6569
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
6697
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6570
6698
  return;
6571
6699
  }
6572
6700
 
@@ -6726,6 +6854,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6726
6854
  uint32_t half_ceil = (dim + 1) / 2;
6727
6855
  elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
6728
6856
  } break;
6857
+ case GGML_OP_CONV_TRANSPOSE_1D:
6858
+ {
6859
+ elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
6860
+ } break;
6729
6861
  case GGML_OP_POOL_2D:
6730
6862
  {
6731
6863
  const uint32_t N = dst->ne[3];
@@ -6800,7 +6932,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6800
6932
  }
6801
6933
 
6802
6934
  ggml_vk_sync_buffers(subctx);
6803
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6935
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6804
6936
  } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
6805
6937
  // Empty src2 is possible in rope, but the shader needs a buffer
6806
6938
  vk_subbuffer subbuf_z;
@@ -6811,26 +6943,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6811
6943
  }
6812
6944
 
6813
6945
  ggml_vk_sync_buffers(subctx);
6814
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6946
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6815
6947
  } else if (op == GGML_OP_IM2COL) {
6816
6948
  // im2col uses only src1 and dst buffers
6817
6949
  ggml_vk_sync_buffers(subctx);
6818
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6950
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6819
6951
  } else if (op == GGML_OP_COUNT_EQUAL) {
6820
6952
  ggml_vk_sync_buffers(subctx);
6821
6953
  // count_equal assumes that destination buffer is initialized with zeroes
6822
6954
  ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
6823
6955
  ggml_vk_sync_buffers(subctx);
6824
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6956
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6825
6957
  } else if (use_src2) {
6826
6958
  ggml_vk_sync_buffers(subctx);
6827
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6959
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6828
6960
  } else if (use_src1) {
6829
6961
  ggml_vk_sync_buffers(subctx);
6830
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6962
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6831
6963
  } else {
6832
6964
  ggml_vk_sync_buffers(subctx);
6833
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
6965
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
6834
6966
  }
6835
6967
  }
6836
6968
 
@@ -6943,7 +7075,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
6943
7075
  GGML_ASSERT(pipeline != nullptr);
6944
7076
 
6945
7077
  if (dryrun) {
6946
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
7078
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
6947
7079
  return;
6948
7080
  }
6949
7081
 
@@ -6999,7 +7131,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
6999
7131
  vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
7000
7132
  vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
7001
7133
  vk_subbuffer{ d_D, dst_offset, dst_size }
7002
- }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
7134
+ }, pc, elements);
7003
7135
  } else if (version == 7) {
7004
7136
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
7005
7137
  vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
@@ -7010,7 +7142,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
7010
7142
  vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
7011
7143
  vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
7012
7144
  vk_subbuffer{ d_D, dst_offset, dst_size }
7013
- }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements);
7145
+ }, pc, elements);
7014
7146
  } else {
7015
7147
  // shouldn't happen
7016
7148
  GGML_ASSERT(false);
@@ -7082,7 +7214,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
7082
7214
  GGML_ASSERT(pipeline != nullptr);
7083
7215
 
7084
7216
  if (dryrun) {
7085
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
7217
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
7086
7218
  return;
7087
7219
  }
7088
7220
 
@@ -7147,7 +7279,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
7147
7279
  vk_subbuffer{ d_GM, gm_offset, gm_size },
7148
7280
  vk_subbuffer{ d_GV, gv_offset, gv_size },
7149
7281
  vk_subbuffer{ d_P, p_offset, p_size },
7150
- }, sizeof(vk_op_push_constants), &pc, elements);
7282
+ }, pc, elements);
7151
7283
  }
7152
7284
 
7153
7285
  static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
@@ -7529,6 +7661,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
7529
7661
  }, dryrun);
7530
7662
  }
7531
7663
 
7664
+ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
7665
+ // src0: (K, Cout, Cin, 1) -- kernel
7666
+ // src1: (L, Cin, 1, 1) -- input
7667
+ // dst: (*, Cout, 1, 1)
7668
+
7669
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7670
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7671
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7672
+
7673
+ GGML_TENSOR_BINARY_OP_LOCALS
7674
+
7675
+ GGML_ASSERT(nb00 == sizeof(float));
7676
+ GGML_ASSERT(nb10 == sizeof(float));
7677
+
7678
+ const int32_t s0 = dst->op_params[0];
7679
+
7680
+ vk_op_conv_transpose_1d_push_constants p{};
7681
+ p.Cout = static_cast<uint32_t>(ne01);
7682
+ p.Cin = static_cast<uint32_t>(ne02);
7683
+ p.K = static_cast<uint32_t>(ne00);
7684
+ p.L = static_cast<uint32_t>(ne10);
7685
+ p.KL = static_cast<uint32_t>(ne0);
7686
+ p.nb01 = static_cast<uint32_t>(nb01 / nb00);
7687
+ p.nb02 = static_cast<uint32_t>(nb02 / nb00);
7688
+ p.nb11 = static_cast<uint32_t>(nb11 / nb10);
7689
+ p.nb1 = static_cast<uint32_t>(nb1 / nb0);
7690
+ p.s0 = static_cast<uint32_t>(s0);
7691
+
7692
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
7693
+ }
7694
+
7532
7695
  static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
7533
7696
  uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
7534
7697
  const int32_t k1 = dst->op_params[1];
@@ -7729,9 +7892,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7729
7892
  }
7730
7893
  }
7731
7894
 
7732
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
7895
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
7733
7896
  if (split_k > 1) {
7734
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
7897
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
7735
7898
 
7736
7899
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
7737
7900
  // Resize buffer
@@ -7746,7 +7909,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7746
7909
  ggml_vk_load_shaders(ctx->device);
7747
7910
  }
7748
7911
 
7749
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
7912
+ ggml_pipeline_allocate_descriptor_sets(ctx);
7750
7913
 
7751
7914
  vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
7752
7915
  vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -7788,7 +7951,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7788
7951
  ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
7789
7952
  ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
7790
7953
 
7791
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
7954
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
7792
7955
  ggml_vk_ctx_begin(ctx->device, subctx);
7793
7956
  for (size_t i = 0; i < num_it; i++) {
7794
7957
  ggml_vk_matmul(
@@ -7804,6 +7967,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7804
7967
  ggml_vk_submit(subctx, ctx->fence);
7805
7968
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
7806
7969
  ctx->device->device.resetFences({ ctx->fence });
7970
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
7807
7971
 
7808
7972
  auto end = std::chrono::high_resolution_clock::now();
7809
7973
  double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
@@ -7905,16 +8069,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
7905
8069
 
7906
8070
  free(d_chk);
7907
8071
 
7908
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
7909
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
8072
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
8073
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
7910
8074
 
7911
8075
  ggml_vk_destroy_buffer(d_X);
7912
8076
  ggml_vk_destroy_buffer(d_Y);
7913
8077
  ggml_vk_destroy_buffer(d_D);
7914
8078
 
7915
- ggml_pipeline_cleanup(p);
7916
- ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
7917
-
7918
8079
  free(x);
7919
8080
  free(y);
7920
8081
  free(d);
@@ -7992,20 +8153,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
7992
8153
  ggml_vk_quantize_data(x, qx, ne, quant);
7993
8154
  ggml_vk_dequantize_data(qx, x_ref, ne, quant);
7994
8155
 
7995
- ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
8156
+ ggml_pipeline_request_descriptor_sets(ctx, p, 1);
7996
8157
 
7997
8158
  if (ctx->device->need_compiles) {
7998
8159
  ggml_vk_load_shaders(ctx->device);
7999
8160
  }
8000
8161
 
8001
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
8162
+ ggml_pipeline_allocate_descriptor_sets(ctx);
8002
8163
 
8003
8164
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
8004
8165
 
8005
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8166
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8006
8167
  ggml_vk_ctx_begin(ctx->device, subctx);
8007
8168
  const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
8008
- ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
8169
+ ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
8009
8170
  ggml_vk_ctx_end(subctx);
8010
8171
 
8011
8172
  auto begin = std::chrono::high_resolution_clock::now();
@@ -8013,6 +8174,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8013
8174
  ggml_vk_submit(subctx, ctx->fence);
8014
8175
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
8015
8176
  ctx->device->device.resetFences({ ctx->fence });
8177
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
8016
8178
 
8017
8179
  auto end = std::chrono::high_resolution_clock::now();
8018
8180
 
@@ -8092,17 +8254,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8092
8254
  //
8093
8255
  // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
8094
8256
  //
8095
- // ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
8257
+ // ggml_pipeline_request_descriptor_sets(ctx, p, 1);
8096
8258
  //
8097
8259
  // if (ctx->device->need_compiles) {
8098
8260
  // ggml_vk_load_shaders(ctx->device);
8099
8261
  // }
8100
8262
  //
8101
- // ggml_pipeline_allocate_descriptor_sets(ctx->device);
8263
+ // ggml_pipeline_allocate_descriptor_sets(ctx);
8102
8264
  //
8103
8265
  // ggml_vk_buffer_write(x_buf, 0, x, x_sz);
8104
8266
  //
8105
- // vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8267
+ // vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8106
8268
  // ggml_vk_ctx_begin(ctx->device, subctx);
8107
8269
  // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
8108
8270
  // ggml_vk_ctx_end(subctx);
@@ -8112,6 +8274,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
8112
8274
  // ggml_vk_submit(subctx, ctx->fence);
8113
8275
  // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
8114
8276
  // ctx->device->device.resetFences({ ctx->fence });
8277
+ // ggml_vk_queue_command_pools_cleanup(ctx->device);
8115
8278
  //
8116
8279
  // auto end = std::chrono::high_resolution_clock::now();
8117
8280
  //
@@ -8251,9 +8414,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8251
8414
  // y[i] = i % k;
8252
8415
  }
8253
8416
 
8254
- ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
8417
+ ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
8255
8418
  if (split_k > 1) {
8256
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
8419
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
8257
8420
 
8258
8421
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
8259
8422
  // Resize buffer
@@ -8264,19 +8427,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8264
8427
  }
8265
8428
  }
8266
8429
  if (mmq) {
8267
- ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
8430
+ ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
8268
8431
  }
8269
8432
 
8270
8433
  if (ctx->device->need_compiles) {
8271
8434
  ggml_vk_load_shaders(ctx->device);
8272
8435
  }
8273
8436
 
8274
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
8437
+ ggml_pipeline_allocate_descriptor_sets(ctx);
8275
8438
 
8276
8439
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
8277
8440
  ggml_vk_buffer_write(y_buf, 0, y, y_sz);
8278
8441
 
8279
- vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8442
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8280
8443
  ggml_vk_ctx_begin(ctx->device, subctx);
8281
8444
  if (mmq) {
8282
8445
  for (size_t i = 0; i < num_it; i++) {
@@ -8305,6 +8468,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
8305
8468
  ggml_vk_submit(subctx, ctx->fence);
8306
8469
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
8307
8470
  ctx->device->device.resetFences({ ctx->fence });
8471
+ ggml_vk_queue_command_pools_cleanup(ctx->device);
8308
8472
 
8309
8473
  auto end = std::chrono::high_resolution_clock::now();
8310
8474
 
@@ -8600,6 +8764,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8600
8764
  case GGML_OP_COUNT_EQUAL:
8601
8765
  case GGML_OP_IM2COL:
8602
8766
  case GGML_OP_TIMESTEP_EMBEDDING:
8767
+ case GGML_OP_CONV_TRANSPOSE_1D:
8603
8768
  case GGML_OP_POOL_2D:
8604
8769
  case GGML_OP_CONV_2D_DW:
8605
8770
  case GGML_OP_RWKV_WKV6:
@@ -8618,7 +8783,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8618
8783
 
8619
8784
  if (!dryrun) {
8620
8785
  if (ctx->compute_ctx.expired()) {
8621
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
8786
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
8622
8787
  ctx->compute_ctx = compute_ctx;
8623
8788
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
8624
8789
  } else {
@@ -8664,6 +8829,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8664
8829
  case GGML_OP_COUNT_EQUAL:
8665
8830
  case GGML_OP_IM2COL:
8666
8831
  case GGML_OP_TIMESTEP_EMBEDDING:
8832
+ case GGML_OP_CONV_TRANSPOSE_1D:
8667
8833
  case GGML_OP_POOL_2D:
8668
8834
  case GGML_OP_CONV_2D_DW:
8669
8835
  case GGML_OP_LEAKY_RELU:
@@ -8671,7 +8837,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8671
8837
  // These operations all go through ggml_vk_op_f32, so short-circuit and
8672
8838
  // do the only thing needed for the dryrun.
8673
8839
  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
8674
- ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
8840
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
8675
8841
  return false;
8676
8842
  }
8677
8843
  default:
@@ -8835,6 +9001,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
8835
9001
  case GGML_OP_TIMESTEP_EMBEDDING:
8836
9002
  ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
8837
9003
 
9004
+ break;
9005
+ case GGML_OP_CONV_TRANSPOSE_1D:
9006
+ ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
9007
+
8838
9008
  break;
8839
9009
  case GGML_OP_POOL_2D:
8840
9010
  ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
@@ -8963,6 +9133,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
8963
9133
  case GGML_OP_COUNT_EQUAL:
8964
9134
  case GGML_OP_IM2COL:
8965
9135
  case GGML_OP_TIMESTEP_EMBEDDING:
9136
+ case GGML_OP_CONV_TRANSPOSE_1D:
8966
9137
  case GGML_OP_POOL_2D:
8967
9138
  case GGML_OP_CONV_2D_DW:
8968
9139
  case GGML_OP_RWKV_WKV6:
@@ -9058,19 +9229,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
9058
9229
  }
9059
9230
  ctx->gc.temp_buffers.clear();
9060
9231
 
9061
- for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
9062
- vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
9063
-
9064
- if (plr.expired()) {
9065
- continue;
9066
- }
9067
-
9068
- vk_pipeline pl = plr.lock();
9069
- ggml_pipeline_cleanup(pl);
9070
- }
9071
-
9072
- ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
9073
- ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
9232
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
9233
+ ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
9074
9234
 
9075
9235
  for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
9076
9236
  ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
@@ -9091,7 +9251,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
9091
9251
 
9092
9252
  ctx->tensor_ctxs.clear();
9093
9253
  ctx->gc.contexts.clear();
9094
- ctx->device->pipeline_descriptor_set_requirements.clear();
9254
+ ctx->pipeline_descriptor_set_requirements = 0;
9255
+ ctx->descriptor_set_idx = 0;
9095
9256
  }
9096
9257
 
9097
9258
  // Clean up on backend free
@@ -9118,6 +9279,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
9118
9279
 
9119
9280
  ctx->device->device.destroyFence(ctx->fence);
9120
9281
  ctx->device->device.destroyFence(ctx->almost_ready_fence);
9282
+
9283
+ for (auto& pool : ctx->descriptor_pools) {
9284
+ ctx->device->device.destroyDescriptorPool(pool);
9285
+ }
9286
+ ctx->descriptor_pools.clear();
9287
+ ctx->descriptor_sets.clear();
9288
+
9289
+ ctx->compute_cmd_pool.destroy(ctx->device->device);
9290
+ ctx->transfer_cmd_pool.destroy(ctx->device->device);
9121
9291
  }
9122
9292
 
9123
9293
  static int ggml_vk_get_device_count() {
@@ -9325,6 +9495,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer
9325
9495
  UNUSED(buft);
9326
9496
  }
9327
9497
 
9498
+ static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
9499
+ return vk_instance.devices[0]->suballocation_block_size;
9500
+
9501
+ UNUSED(buft);
9502
+ }
9503
+
9328
9504
  // Should be changed to return device-specific host buffer type
9329
9505
  // but that probably requires changes in llama.cpp
9330
9506
  ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
@@ -9333,7 +9509,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
9333
9509
  /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
9334
9510
  /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
9335
9511
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
9336
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
9512
+ /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size,
9337
9513
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9338
9514
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
9339
9515
  },
@@ -9384,7 +9560,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
9384
9560
 
9385
9561
  if (ctx->transfer_ctx.expired()) {
9386
9562
  // Initialize new transfer context
9387
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9563
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9388
9564
  ctx->transfer_ctx = transfer_ctx;
9389
9565
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9390
9566
  } else {
@@ -9407,7 +9583,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
9407
9583
 
9408
9584
  if (ctx->transfer_ctx.expired()) {
9409
9585
  // Initialize new transfer context
9410
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9586
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9411
9587
  ctx->transfer_ctx = transfer_ctx;
9412
9588
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9413
9589
  } else {
@@ -9430,7 +9606,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
9430
9606
 
9431
9607
  if (ctx->transfer_ctx.expired()) {
9432
9608
  // Initialize new transfer context
9433
- transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
9609
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
9434
9610
  ctx->transfer_ctx = transfer_ctx;
9435
9611
  ggml_vk_ctx_begin(ctx->device, transfer_ctx);
9436
9612
  } else {
@@ -9491,7 +9667,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9491
9667
  ggml_vk_load_shaders(ctx->device);
9492
9668
  }
9493
9669
  ggml_vk_preallocate_buffers(ctx);
9494
- ggml_pipeline_allocate_descriptor_sets(ctx->device);
9670
+ ggml_pipeline_allocate_descriptor_sets(ctx);
9495
9671
 
9496
9672
  int last_node = cgraph->n_nodes - 1;
9497
9673
 
@@ -9513,8 +9689,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9513
9689
  if (ctx->device->query_pool) {
9514
9690
  ctx->device->device.destroyQueryPool(ctx->device->query_pool);
9515
9691
  }
9516
- VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
9517
- query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
9692
+ vk::QueryPoolCreateInfo query_create_info;
9693
+ query_create_info.queryType = vk::QueryType::eTimestamp;
9518
9694
  query_create_info.queryCount = cgraph->n_nodes + 100;
9519
9695
  ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
9520
9696
  ctx->device->num_queries = query_create_info.queryCount;
@@ -9523,7 +9699,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9523
9699
  ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
9524
9700
 
9525
9701
  GGML_ASSERT(ctx->compute_ctx.expired());
9526
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9702
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
9527
9703
  ctx->compute_ctx = compute_ctx;
9528
9704
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
9529
9705
  compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
@@ -9558,7 +9734,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9558
9734
 
9559
9735
  if (vk_perf_logger_enabled) {
9560
9736
  if (ctx->compute_ctx.expired()) {
9561
- compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
9737
+ compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
9562
9738
  ctx->compute_ctx = compute_ctx;
9563
9739
  ggml_vk_ctx_begin(ctx->device, compute_ctx);
9564
9740
  } else {
@@ -9600,7 +9776,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
9600
9776
 
9601
9777
  // Get the results and pass them to the logger
9602
9778
  std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
9603
- ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
9779
+ VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
9604
9780
  for (int i = 0; i < cgraph->n_nodes; i++) {
9605
9781
  if (!ggml_vk_is_empty(cgraph->nodes[i])) {
9606
9782
  ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
@@ -10024,6 +10200,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
10024
10200
  case GGML_OP_LEAKY_RELU:
10025
10201
  case GGML_OP_OPT_STEP_ADAMW:
10026
10202
  return true;
10203
+ case GGML_OP_CONV_TRANSPOSE_1D:
10204
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
10027
10205
  default:
10028
10206
  return false;
10029
10207
  }
@@ -10170,8 +10348,9 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
10170
10348
  static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
10171
10349
  switch (props.vendorID) {
10172
10350
  case VK_VENDOR_ID_INTEL:
10173
- // Intel drivers don't support coopmat properly yet
10174
- return false;
10351
+ // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
10352
+ // while some older hardware (ex. Arc A770) has performance regressions
10353
+ return arch == vk_device_architecture::INTEL_XE2;
10175
10354
  case VK_VENDOR_ID_AMD:
10176
10355
  if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
10177
10356
  // Workaround for AMD proprietary driver reporting support on all GPUs
@@ -10515,6 +10694,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
10515
10694
  const int32_t dim = tensor->op_params[0];
10516
10695
  const int32_t max_period = tensor->op_params[1];
10517
10696
  tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
10697
+ } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
10698
+ const int32_t s0 = tensor->op_params[0];
10699
+ const int32_t p0 = tensor->op_params[1];
10700
+ const int32_t d0 = tensor->op_params[2];
10701
+ tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
10518
10702
  } else if (tensor->op == GGML_OP_POOL_2D) {
10519
10703
  enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
10520
10704
  const int32_t k0 = tensor->op_params[1];