@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -1,6 +1,6 @@
1
1
  #include "ggml-vulkan.h"
2
2
  #include <vulkan/vulkan_core.h>
3
- #ifdef GGML_VULKAN_RUN_TESTS
3
+ #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF)
4
4
  #include <chrono>
5
5
  #endif
6
6
 
@@ -17,10 +17,13 @@
17
17
  #include <memory>
18
18
  #include <limits>
19
19
  #include <map>
20
+ #include <unordered_map>
20
21
  #include <memory>
21
22
  #include <mutex>
23
+ #include <future>
24
+ #include <thread>
22
25
 
23
- #include "ggml.h"
26
+ #include "ggml-impl.h"
24
27
  #include "ggml-backend-impl.h"
25
28
 
26
29
  #include "ggml-vulkan-shaders.hpp"
@@ -34,9 +37,7 @@
34
37
  #define VK_VENDOR_ID_INTEL 0x8086
35
38
  #define VK_VENDOR_ID_NVIDIA 0x10de
36
39
 
37
- #define VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN 0
38
- #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
39
- #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
40
+ #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
40
41
 
41
42
  #define GGML_VK_MAX_NODES 8192
42
43
 
@@ -74,6 +75,8 @@ struct vk_queue {
74
75
  std::vector<vk::CommandBuffer> cmd_buffers;
75
76
 
76
77
  vk::PipelineStageFlags stage_flags;
78
+
79
+ bool transfer_only;
77
80
  };
78
81
 
79
82
  struct vk_pipeline_struct {
@@ -116,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context {
116
119
  vk_device device;
117
120
  };
118
121
 
119
- GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
120
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
121
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
122
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
123
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
122
+ static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
123
+ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
124
+ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
125
+ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
126
+ static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
124
127
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
125
128
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
126
129
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
@@ -133,6 +136,9 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
133
136
  #ifdef GGML_VULKAN_MEMORY_DEBUG
134
137
  class vk_memory_logger;
135
138
  #endif
139
+ #ifdef GGML_VULKAN_PERF
140
+ class vk_perf_logger;
141
+ #endif
136
142
  static void ggml_vk_destroy_buffer(vk_buffer& buf);
137
143
 
138
144
  struct vk_device_struct {
@@ -148,7 +154,6 @@ struct vk_device_struct {
148
154
  vk_queue compute_queue;
149
155
  vk_queue transfer_queue;
150
156
  bool single_queue;
151
- uint32_t descriptor_set_mode;
152
157
  uint32_t subgroup_size;
153
158
  bool uma;
154
159
 
@@ -177,26 +182,40 @@ struct vk_device_struct {
177
182
  vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
178
183
  vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
179
184
  vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
185
+ vk_pipeline pipeline_acc_f32;
186
+ vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
180
187
  vk_pipeline pipeline_mul_f32;
181
188
  vk_pipeline pipeline_div_f32;
182
- vk_pipeline pipeline_add_f32;
189
+ vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
190
+ vk_pipeline pipeline_upscale_f32;
183
191
  vk_pipeline pipeline_scale_f32;
184
192
  vk_pipeline pipeline_sqr_f32;
193
+ vk_pipeline pipeline_sin_f32;
194
+ vk_pipeline pipeline_cos_f32;
185
195
  vk_pipeline pipeline_clamp_f32;
196
+ vk_pipeline pipeline_pad_f32;
197
+ vk_pipeline pipeline_repeat_f32;
186
198
  vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
187
199
  vk_pipeline pipeline_norm_f32;
200
+ vk_pipeline pipeline_group_norm_f32;
188
201
  vk_pipeline pipeline_rms_norm_f32;
189
202
  vk_pipeline pipeline_gelu_f32;
203
+ vk_pipeline pipeline_gelu_quick_f32;
190
204
  vk_pipeline pipeline_silu_f32;
191
205
  vk_pipeline pipeline_relu_f32;
206
+ vk_pipeline pipeline_leaky_relu_f32;
207
+ vk_pipeline pipeline_tanh_f32;
192
208
  vk_pipeline pipeline_diag_mask_inf_f32;
193
209
  vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
194
210
  vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
195
211
  vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
196
212
  vk_pipeline pipeline_argsort_f32;
197
213
  vk_pipeline pipeline_sum_rows_f32;
214
+ vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
215
+ vk_pipeline pipeline_timestep_embedding_f32;
198
216
 
199
- std::vector<vk_pipeline_ref> pipelines;
217
+ std::unordered_map<std::string, vk_pipeline_ref> pipelines;
218
+ std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
200
219
 
201
220
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
202
221
 
@@ -208,6 +227,9 @@ struct vk_device_struct {
208
227
  #ifdef GGML_VULKAN_MEMORY_DEBUG
209
228
  std::unique_ptr<vk_memory_logger> memory_logger;
210
229
  #endif
230
+ #ifdef GGML_VULKAN_PERF
231
+ std::unique_ptr<vk_perf_logger> perf_logger;
232
+ #endif
211
233
 
212
234
  ~vk_device_struct() {
213
235
  VK_LOG_DEBUG("destroy device " << name);
@@ -222,11 +244,11 @@ struct vk_device_struct {
222
244
  }
223
245
 
224
246
  for (auto& pipeline : pipelines) {
225
- if (pipeline.expired()) {
247
+ if (pipeline.second.expired()) {
226
248
  continue;
227
249
  }
228
250
 
229
- vk_pipeline pl = pipeline.lock();
251
+ vk_pipeline pl = pipeline.second.lock();
230
252
  ggml_vk_destroy_pipeline(device, pl);
231
253
  }
232
254
  pipelines.clear();
@@ -259,6 +281,10 @@ struct vk_subbuffer {
259
281
  vk_buffer buffer;
260
282
  uint64_t offset;
261
283
  uint64_t size;
284
+
285
+ operator vk::DescriptorBufferInfo() const {
286
+ return { buffer->buffer, offset, size };
287
+ }
262
288
  };
263
289
 
264
290
  struct vk_semaphore {
@@ -320,7 +346,7 @@ struct vk_op_binary_push_constants {
320
346
  uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
321
347
  uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
322
348
  uint32_t d_offset;
323
- float param1; float param2;
349
+ float param1; float param2; int32_t param3;
324
350
  };
325
351
 
326
352
  struct vk_op_diag_mask_push_constants {
@@ -358,6 +384,25 @@ struct vk_op_argsort_push_constants {
358
384
  int32_t order;
359
385
  };
360
386
 
387
+ struct vk_op_im2col_push_constants {
388
+ uint32_t batch_offset; uint32_t offset_delta;
389
+ uint32_t IC;
390
+ uint32_t IW; uint32_t IH;
391
+ uint32_t OW; uint32_t OH;
392
+ uint32_t KW; uint32_t KH;
393
+ uint32_t pelements;
394
+ uint32_t CHW;
395
+ int32_t s0; int32_t s1;
396
+ int32_t p0; int32_t p1;
397
+ int32_t d0; int32_t d1;
398
+ };
399
+
400
+ struct vk_op_timestep_embedding_push_constants {
401
+ uint32_t nb1;
402
+ uint32_t dim;
403
+ uint32_t max_period;
404
+ };
405
+
361
406
  // Allow pre-recording command buffers
362
407
  struct vk_staging_memcpy {
363
408
  vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -367,32 +412,26 @@ struct vk_staging_memcpy {
367
412
  size_t n;
368
413
  };
369
414
 
370
- struct vk_context {
371
- size_t idx;
415
+ struct vk_op_upscale_push_constants {
416
+ uint32_t ne; uint32_t d_offset;
417
+ uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
418
+ uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
419
+ float sf0; float sf1; float sf2; float sf3;
420
+ };
372
421
 
422
+ struct vk_context_struct {
373
423
  vk_submission * s;
374
424
  std::vector<vk_sequence> seqs;
375
425
 
376
- ggml_tensor * exit_tensor;
426
+ int exit_tensor_idx;
377
427
 
378
428
  std::vector<vk_staging_memcpy> in_memcpys;
379
429
  std::vector<vk_staging_memcpy> out_memcpys;
380
430
 
381
431
  vk_queue * q;
382
432
  };
383
-
384
- struct ggml_tensor_extra_gpu {
385
- size_t ctx_idx;
386
-
387
- vk_buffer_ref buffer_gpu;
388
- uint64_t offset;
389
-
390
- void reset() {
391
- ctx_idx = 0;
392
- buffer_gpu.reset();
393
- offset = 0;
394
- }
395
- };
433
+ typedef std::shared_ptr<vk_context_struct> vk_context;
434
+ typedef std::weak_ptr<vk_context_struct> vk_context_ref;
396
435
 
397
436
  struct ggml_vk_garbage_collector {
398
437
  std::vector<vk_semaphore> tl_semaphores;
@@ -443,6 +482,48 @@ private:
443
482
  #define VK_LOG_MEMORY(msg) ((void) 0)
444
483
  #endif // GGML_VULKAN_MEMORY_DEBUG
445
484
 
485
+ #if defined(GGML_VULKAN_PERF)
486
+
487
+ class vk_perf_logger {
488
+ public:
489
+ void print_timings() {
490
+ std::cerr << "----------------\nVulkan Timings:" << std::endl;
491
+ for (const auto& t : timings) {
492
+ uint64_t total = 0;
493
+ for (const auto& time : t.second) {
494
+ total += time;
495
+ }
496
+ std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
497
+ }
498
+
499
+ timings.clear();
500
+ }
501
+
502
+ void log_timing(const ggml_tensor * node, uint64_t time) {
503
+ if (node->op == GGML_OP_UNARY) {
504
+ timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
505
+ return;
506
+ }
507
+ if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
508
+ const uint64_t m = node->src[0]->ne[1];
509
+ const uint64_t n = node->src[1]->ne[1];
510
+ const uint64_t k = node->src[1]->ne[0];
511
+ std::string name = ggml_op_name(node->op);
512
+ if (n == 1) {
513
+ name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
514
+ } else {
515
+ name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
516
+ }
517
+ timings[name].push_back(time);
518
+ return;
519
+ }
520
+ timings[ggml_op_name(node->op)].push_back(time);
521
+ }
522
+ private:
523
+ std::map<std::string, std::vector<uint64_t>> timings;
524
+ };
525
+ #endif // GGML_VULKAN_PERF
526
+
446
527
  struct ggml_backend_vk_context {
447
528
  std::string name;
448
529
 
@@ -453,14 +534,38 @@ struct ggml_backend_vk_context {
453
534
  size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
454
535
  vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
455
536
  vk::Fence fence;
456
- vk_buffer staging;
457
- size_t staging_size;
458
- size_t staging_offset;
459
537
 
460
538
  vk_buffer buffer_pool[MAX_VK_BUFFERS];
461
539
 
462
- vk_context * compute_ctx;
463
- vk_context * transfer_ctx;
540
+ vk_context_ref compute_ctx;
541
+ vk_context_ref transfer_ctx;
542
+
543
+ std::vector<vk_context_ref> tensor_ctxs;
544
+ };
545
+
546
+ static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
547
+
548
+ static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
549
+ if (tensor->view_src) {
550
+ return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
551
+ }
552
+ return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
553
+ }
554
+
555
+ struct ggml_backend_vk_buffer_context {
556
+ vk_device_ref device;
557
+ vk_buffer dev_buffer;
558
+ std::string name;
559
+
560
+ ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
561
+ device(device),
562
+ dev_buffer(dev_buffer),
563
+ name(name) {
564
+ }
565
+
566
+ ~ggml_backend_vk_buffer_context() {
567
+ ggml_vk_destroy_buffer(dev_buffer);
568
+ }
464
569
  };
465
570
 
466
571
  #ifdef GGML_VULKAN_MEMORY_DEBUG
@@ -510,22 +615,25 @@ static vk_instance_t vk_instance;
510
615
  static size_t vk_skip_checks;
511
616
  static size_t vk_output_tensor;
512
617
 
513
- static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
514
- static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
515
- static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
618
+ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
619
+ static void ggml_vk_check_results_0(ggml_tensor * tensor);
620
+ static void ggml_vk_check_results_1(ggml_tensor * tensor);
516
621
  #endif
517
622
 
518
- typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
623
+ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
519
624
 
520
- GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
625
+ static void ggml_backend_vk_free(ggml_backend_t backend);
521
626
 
522
- static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
627
+ // variables to track number of compiles in progress
628
+ static uint32_t compile_count = 0;
629
+ static std::mutex compile_count_mutex;
630
+ static std::condition_variable compile_count_cond;
631
+
632
+ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
523
633
  VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
524
634
  GGML_ASSERT(parameter_count > 0);
525
635
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
526
636
 
527
- std::lock_guard<std::mutex> guard(device->mutex);
528
-
529
637
  pipeline = std::make_shared<vk_pipeline_struct>();
530
638
  pipeline->name = name;
531
639
  pipeline->parameter_count = parameter_count;
@@ -557,35 +665,9 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
557
665
  descriptor_set_layout_create_info.setPNext(&dslbfci);
558
666
  pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
559
667
 
560
- // Check if device supports multiple descriptors per pool
561
- if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
562
- const uint32_t alloc_count = 2;
563
-
564
- // Try allocating multiple sets from one pool
565
- // This fails on AMD for some reason, so add a fall back to allocating one pool per set
566
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
567
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
568
- vk::DescriptorPool pool = device->device.createDescriptorPool(descriptor_pool_create_info);
569
-
570
- std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
571
- for (uint32_t i = 0; i < alloc_count; i++) {
572
- layouts[i] = pipeline->dsl;
573
- }
574
- try {
575
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
576
- std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
577
- } catch(vk::OutOfPoolMemoryError const&) {
578
- device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
579
- }
580
-
581
- device->device.destroyDescriptorPool(pool);
582
- }
583
-
584
- if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
585
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
586
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
587
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
588
- }
668
+ vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
669
+ vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
670
+ pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
589
671
 
590
672
  pipeline->descriptor_set_idx = 0;
591
673
 
@@ -619,7 +701,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
619
701
  pipeline->layout);
620
702
  pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
621
703
 
622
- device->pipelines.push_back(pipeline);
704
+ {
705
+ std::lock_guard<std::mutex> guard(device->mutex);
706
+ device->pipelines.insert({ pipeline->name, pipeline });
707
+ }
708
+
709
+ {
710
+ std::lock_guard<std::mutex> guard(compile_count_mutex);
711
+ assert(compile_count > 0);
712
+ compile_count--;
713
+ }
714
+ compile_count_cond.notify_all();
623
715
  }
624
716
 
625
717
  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
@@ -640,34 +732,49 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
640
732
  device.destroyPipeline(pipeline->pipeline);
641
733
  }
642
734
 
643
- static void ggml_pipeline_allocate_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
644
- VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
645
- if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
646
- // Enough descriptors are available
647
- return;
648
- }
735
+ static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
736
+ VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
737
+ device->pipeline_descriptor_set_requirements[pipeline->name] += n;
738
+ }
649
739
 
740
+ static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
650
741
  std::lock_guard<std::mutex> guard(device->mutex);
651
742
 
652
- if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
653
- const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
743
+ for (auto& pair : device->pipeline_descriptor_set_requirements) {
744
+ vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
745
+ const uint64_t n = pair.second;
654
746
 
655
- std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
656
- for (uint32_t i = 0; i < alloc_count; i++) {
657
- layouts[i] = pipeline->dsl;
747
+ VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
748
+
749
+ if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
750
+ // Enough descriptors are available
751
+ continue;
658
752
  }
659
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data());
660
- std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
661
- pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
662
- } else {
663
- for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) {
664
- vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
665
- vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
666
- pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
667
753
 
668
- vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl);
754
+ uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
755
+ uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
756
+ uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
757
+
758
+ while (to_alloc > 0) {
759
+ const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
760
+ to_alloc -= alloc_count;
761
+ pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
762
+
763
+ if (pool_idx >= pipeline->descriptor_pools.size()) {
764
+ vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
765
+ vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
766
+ pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
767
+ }
768
+
769
+ std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
770
+ for (uint32_t i = 0; i < alloc_count; i++) {
771
+ layouts[i] = pipeline->dsl;
772
+ }
773
+ vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
669
774
  std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
670
- pipeline->descriptor_sets.push_back(sets[0]);
775
+ pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
776
+
777
+ pool_idx++;
671
778
  }
672
779
  }
673
780
  }
@@ -708,11 +815,14 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
708
815
  return s;
709
816
  }
710
817
 
711
- static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
712
- VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
818
+ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
713
819
  if (ctx->seqs.empty()) {
820
+ if (fence) {
821
+ ctx->q->queue.submit({}, fence);
822
+ }
714
823
  return;
715
824
  }
825
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
716
826
 
717
827
  std::vector<std::vector<uint64_t>> tl_wait_vals;
718
828
  std::vector<std::vector<uint64_t>> tl_signal_vals;
@@ -828,11 +938,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
828
938
  abort();
829
939
  }
830
940
 
831
- static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
941
+ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
832
942
  VK_LOG_DEBUG("ggml_vk_create_queue()");
833
943
  std::lock_guard<std::mutex> guard(device->mutex);
834
944
 
835
945
  q.queue_family_index = queue_family_index;
946
+ q.transfer_only = transfer_only;
836
947
 
837
948
  vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
838
949
  q.pool = device->device.createCommandPool(command_pool_create_info_compute);
@@ -844,21 +955,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
844
955
  q.stage_flags = stage_flags;
845
956
  }
846
957
 
847
- static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
848
- VK_LOG_DEBUG("ggml_vk_create_context()");
849
- ctx->gc.contexts.emplace_back();
850
- vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
851
- memset((void *) result, 0, sizeof(vk_context));
852
- result->idx = ctx->gc.contexts.size() - 1;
958
+ static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
959
+ vk_context result = std::make_shared<vk_context_struct>();
960
+ VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
961
+ ctx->gc.contexts.emplace_back(result);
853
962
  result->q = &q;
854
963
  return result;
855
964
  }
856
965
 
857
- static vk_context * ggml_vk_create_temporary_context(vk_queue& q) {
858
- VK_LOG_DEBUG("ggml_vk_create_temporary_context()");
859
- vk_context * result = new vk_context;
860
- memset((void *) result, 0, sizeof(vk_context));
861
- result->idx = 0;
966
+ static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
967
+ vk_context result = std::make_shared<vk_context_struct>();
968
+ VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
862
969
  result->q = &q;
863
970
  return result;
864
971
  }
@@ -915,6 +1022,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
915
1022
 
916
1023
  static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
917
1024
  VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
1025
+ if (size > device->max_memory_allocation_size) {
1026
+ throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
1027
+ }
1028
+
918
1029
  std::lock_guard<std::mutex> guard(device->mutex);
919
1030
 
920
1031
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -959,10 +1070,25 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
959
1070
  try {
960
1071
  buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
961
1072
  } catch (const vk::SystemError& e) {
962
- // Out of Host/Device memory, clean up buffer
963
- device->device.destroyBuffer(buf->buffer);
964
- buf->size = 0;
965
- throw e;
1073
+ if (buf->memory_property_flags != fallback_flags) {
1074
+ // Try again with fallback flags
1075
+ memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
1076
+ buf->memory_property_flags = fallback_flags;
1077
+
1078
+ try {
1079
+ buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
1080
+ }
1081
+ catch (const vk::SystemError& e) {
1082
+ device->device.destroyBuffer(buf->buffer);
1083
+ buf->size = 0;
1084
+ throw e;
1085
+ }
1086
+ } else {
1087
+ // Out of Host/Device memory, clean up buffer
1088
+ device->device.destroyBuffer(buf->buffer);
1089
+ buf->size = 0;
1090
+ throw e;
1091
+ }
966
1092
  }
967
1093
  buf->ptr = nullptr;
968
1094
 
@@ -998,7 +1124,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
998
1124
  // Fall back to host memory type
999
1125
  buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
1000
1126
  } else {
1001
- buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
1127
+ // use rebar if available, otherwise fallback to device only visible memory
1128
+ buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
1002
1129
  }
1003
1130
  } catch (const vk::SystemError& e) {
1004
1131
  std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
@@ -1027,21 +1154,25 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
1027
1154
  return { buf, 0, VK_WHOLE_SIZE };
1028
1155
  }
1029
1156
 
1030
- static void ggml_vk_sync_buffers(vk_context * ctx) {
1157
+ static void ggml_vk_sync_buffers(vk_context& ctx) {
1031
1158
  VK_LOG_DEBUG("ggml_vk_sync_buffers()");
1032
- const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
1159
+
1160
+ const bool transfer_queue = ctx->q->transfer_only;
1033
1161
 
1034
1162
  ctx->s->buffer.pipelineBarrier(
1035
1163
  ctx->q->stage_flags,
1036
1164
  ctx->q->stage_flags,
1037
1165
  {},
1038
- mem_barriers,
1166
+ { {
1167
+ { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
1168
+ { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }
1169
+ } },
1039
1170
  {},
1040
1171
  {}
1041
1172
  );
1042
1173
  }
1043
1174
 
1044
- static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
1175
+ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
1045
1176
  VK_LOG_DEBUG("ggml_vk_wait_events()");
1046
1177
  if (events.empty()) {
1047
1178
  return;
@@ -1063,11 +1194,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
1063
1194
  // mulmat
1064
1195
  std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
1065
1196
  std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
1066
- std::initializer_list<uint32_t> warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
1197
+ std::initializer_list<uint32_t> warptile_s = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
1067
1198
 
1068
1199
  std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
1069
1200
  std::initializer_list<uint32_t> warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
1070
- std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
1201
+ std::initializer_list<uint32_t> warptile_mmq_s = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
1071
1202
 
1072
1203
  std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
1073
1204
  std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
@@ -1108,6 +1239,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
1108
1239
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
1109
1240
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
1110
1241
 
1242
+ std::vector<std::future<void>> compiles;
1243
+ auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
1244
+ {
1245
+ // wait until fewer than N compiles are in progress
1246
+ uint32_t N = std::max(1u, std::thread::hardware_concurrency());
1247
+ std::unique_lock<std::mutex> guard(compile_count_mutex);
1248
+ while (compile_count >= N) {
1249
+ compile_count_cond.wait(guard);
1250
+ }
1251
+ compile_count++;
1252
+ }
1253
+ compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
1254
+ };
1255
+
1111
1256
  if (device->fp16) {
1112
1257
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1113
1258
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1598,6 +1743,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1598
1743
  ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
1599
1744
 
1600
1745
  ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
1746
+ ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
1601
1747
  ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
1602
1748
 
1603
1749
  ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -1605,20 +1751,37 @@ static void ggml_vk_load_shaders(vk_device& device) {
1605
1751
  ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1606
1752
 
1607
1753
  ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1754
+ ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1608
1755
 
1609
- ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1756
+ ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1610
1757
 
1758
+ ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1611
1759
  ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1612
1760
 
1761
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1762
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1763
+ ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
1764
+
1765
+ ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
1766
+
1613
1767
  ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1614
1768
 
1615
1769
  ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1770
+ ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1771
+ ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1616
1772
 
1617
1773
  ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1618
1774
 
1775
+ ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1776
+
1777
+ ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1778
+
1619
1779
  ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1780
+ ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1620
1781
  ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1621
1782
  ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1783
+ ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1784
+ ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1622
1785
 
1623
1786
  ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
1624
1787
 
@@ -1634,6 +1797,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
1634
1797
  ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1635
1798
 
1636
1799
  ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1800
+
1801
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
1802
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
1803
+
1804
+ ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
1805
+
1806
+ for (auto &c : compiles) {
1807
+ c.wait();
1808
+ }
1637
1809
  }
1638
1810
 
1639
1811
  static vk_device ggml_vk_get_device(size_t idx) {
@@ -1647,6 +1819,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
1647
1819
  #ifdef GGML_VULKAN_MEMORY_DEBUG
1648
1820
  device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
1649
1821
  #endif
1822
+ #ifdef GGML_VULKAN_PERF
1823
+ device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
1824
+ #endif
1650
1825
 
1651
1826
  size_t dev_num = vk_instance.device_indices[idx];
1652
1827
 
@@ -1777,17 +1952,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
1777
1952
  device_create_info.setPNext(&device_features2);
1778
1953
  device->device = device->physical_device.createDevice(device_create_info);
1779
1954
 
1780
- device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
1781
-
1782
1955
  // Queues
1783
- ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
1956
+ ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false);
1784
1957
 
1785
1958
  // Shaders
1786
1959
  ggml_vk_load_shaders(device);
1787
1960
 
1788
1961
  if (!device->single_queue) {
1789
1962
  const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
1790
- ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
1963
+ ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
1791
1964
  } else {
1792
1965
  // TODO: Use pointer or reference to avoid copy
1793
1966
  device->transfer_queue = device->compute_queue;
@@ -1795,6 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
1795
1968
 
1796
1969
  device->buffer_type = {
1797
1970
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
1971
+ /* .device = */ nullptr,
1798
1972
  /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
1799
1973
  };
1800
1974
 
@@ -2057,9 +2231,9 @@ void ggml_vk_instance_init() {
2057
2231
  }
2058
2232
 
2059
2233
  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
2060
- GGML_ASSERT(idx < vk_instance.device_indices.size());
2061
2234
  VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
2062
2235
  ggml_vk_instance_init();
2236
+ GGML_ASSERT(idx < vk_instance.device_indices.size());
2063
2237
 
2064
2238
  ctx->name = GGML_VK_NAME + std::to_string(idx);
2065
2239
 
@@ -2074,12 +2248,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
2074
2248
 
2075
2249
  ctx->fence = ctx->device->device.createFence({});
2076
2250
 
2077
- ctx->staging_size = 0;
2078
- ctx->staging_offset = 0;
2079
-
2080
- ctx->compute_ctx = nullptr;
2081
- ctx->transfer_ctx = nullptr;
2082
-
2083
2251
  #ifdef GGML_VULKAN_CHECK_RESULTS
2084
2252
  const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
2085
2253
  vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -2112,7 +2280,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
2112
2280
  }
2113
2281
 
2114
2282
  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
2115
- VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
2283
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
2116
2284
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
2117
2285
  return ctx->device->pipeline_matmul_f32;
2118
2286
  }
@@ -2126,7 +2294,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
2126
2294
  return ctx->device->pipeline_matmul_f16;
2127
2295
  }
2128
2296
 
2129
- GGML_ASSERT(src1_type == GGML_TYPE_F32);
2297
+ if (src1_type != GGML_TYPE_F32) {
2298
+ return nullptr;
2299
+ }
2130
2300
 
2131
2301
  switch (src0_type) {
2132
2302
  case GGML_TYPE_Q4_0:
@@ -2370,28 +2540,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
2370
2540
  return s;
2371
2541
  }
2372
2542
 
2373
- static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
2543
+
2544
+
2545
+ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
2374
2546
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
2375
2547
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
2376
2548
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
2377
2549
  VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2378
- for (auto& buffer : buffers) {
2379
- std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
2550
+ for (auto& buffer : descriptor_buffer_infos) {
2551
+ std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
2380
2552
  }
2381
2553
  std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
2382
- std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
2383
- std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
2384
2554
  GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
2385
- GGML_ASSERT(buffers.size() == pipeline->parameter_count);
2386
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
2387
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
2388
- descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
2389
- }
2390
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
2391
- write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
2392
- }
2555
+ GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
2393
2556
 
2394
- ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
2557
+ vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
2558
+ vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
2559
+ ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
2395
2560
 
2396
2561
  subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
2397
2562
  subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
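The hunk above collapses the per-binding descriptor updates into a single `vk::WriteDescriptorSet` covering all storage-buffer bindings at once, consuming the caller's `vk::DescriptorBufferInfo` initializer list directly. A minimal sketch of that pattern, assuming Vulkan-Hpp and consecutive storage-buffer bindings starting at 0 (names here are illustrative, not the backend's helpers):

```cpp
// One vkUpdateDescriptorSets call covering bindings 0..N-1, instead of one
// vk::WriteDescriptorSet per binding. Relies on consecutive-binding overflow
// for descriptorCount > 1, which requires all bindings to share the same type.
#include <vulkan/vulkan.hpp>
#include <initializer_list>

void update_storage_bindings(vk::Device device, vk::DescriptorSet set,
                             std::initializer_list<vk::DescriptorBufferInfo> infos) {
    vk::WriteDescriptorSet write{
        set,
        /* dstBinding      */ 0,
        /* dstArrayElement */ 0,
        /* descriptorCount */ static_cast<uint32_t>(infos.size()),
        vk::DescriptorType::eStorageBuffer,
        /* pImageInfo      */ nullptr,
        /* pBufferInfo     */ infos.begin()
    };
    device.updateDescriptorSets({ write }, {});
}
```

This assumes the descriptor set layout declares its storage buffers as consecutive bindings with identical descriptor types, which is what lets a single write with `descriptorCount == parameter_count` spill across bindings.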
@@ -2410,7 +2575,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
2410
2575
  s.signal_semaphores = std::move(signal_semaphores);
2411
2576
  }
2412
2577
 
2413
- static void ggml_vk_ctx_end(vk_context * ctx) {
2578
+ static void ggml_vk_ctx_end(vk_context& ctx) {
2414
2579
  VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
2415
2580
  if (ctx->s == nullptr) {
2416
2581
  return;
@@ -2420,7 +2585,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
2420
2585
  ctx->s = nullptr;
2421
2586
  }
2422
2587
 
2423
- static void ggml_vk_ctx_begin(vk_device& device, vk_context * subctx) {
2588
+ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
2424
2589
  VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
2425
2590
  if (subctx->s != nullptr) {
2426
2591
  ggml_vk_ctx_end(subctx);
@@ -2453,7 +2618,7 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
2453
2618
  }
2454
2619
  }
2455
2620
 
2456
- static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2621
+ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2457
2622
  VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2458
2623
  GGML_ASSERT(!ggml_is_contiguous(tensor));
2459
2624
  // Buffer is already mapped
@@ -2515,23 +2680,15 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2515
2680
  return;
2516
2681
  }
2517
2682
 
2518
- // Staging buffer required
2519
- vk_buffer staging = ctx->staging;
2520
- size_t staging_offset = ctx->staging_offset;
2521
- const size_t copy_size = ts*ne/bs;
2522
- if (ctx->staging->size < ctx->staging_offset + copy_size) {
2523
- if (sync_staging) {
2524
- // Create temporary larger buffer
2525
- ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
2526
-
2527
- staging = ctx->device->sync_staging;
2528
- staging_offset = 0;
2529
- } else {
2530
- GGML_ABORT("fatal error");
2531
- }
2683
+ if (!sync_staging) {
2684
+ GGML_ABORT("Asynchronous write to non-pinned memory not supported");
2532
2685
  }
2533
2686
 
2534
- VkBufferCopy buf_copy{ staging_offset, offset, copy_size };
2687
+ // Staging buffer required
2688
+ vk_buffer& staging = ctx->device->sync_staging;
2689
+ const uint64_t copy_size = ts*ne/bs;
2690
+ ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
2691
+ VkBufferCopy buf_copy{ 0, offset, copy_size };
2535
2692
 
2536
2693
  ggml_vk_sync_buffers(subctx);
2537
2694
  vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
@@ -2540,14 +2697,14 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2540
2697
  for (uint64_t i2 = 0; i2 < ne2; i2++) {
2541
2698
  // Find longest contiguous slice
2542
2699
  if (ne1*nb1 == dstnb2) {
2543
- deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
2700
+ deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
2544
2701
  } else {
2545
2702
  for (uint64_t i1 = 0; i1 < ne1; i1++) {
2546
2703
  if (ne0*nb0/bs == dstnb1) {
2547
- deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
2704
+ deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
2548
2705
  } else {
2549
2706
  const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
2550
- const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
2707
+ const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
2551
2708
  for (uint64_t i0 = 0; i0 < ne0; i0++) {
2552
2709
  deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
2553
2710
  }
@@ -2558,7 +2715,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2558
2715
  }
2559
2716
  }
2560
2717
 
2561
- static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
2718
+ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2562
2719
  VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2563
2720
  // Buffer is already mapped
2564
2721
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2593,21 +2750,18 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
2593
2750
  }
2594
2751
  VK_LOG_DEBUG("STAGING");
2595
2752
 
2753
+ if (!sync_staging) {
2754
+ GGML_ABORT("Asynchronous write to non-pinned memory not supported");
2755
+ }
2756
+
2596
2757
  // Staging buffer required
2597
2758
  const size_t copy_size = width*height;
2598
- if (staging_buffer == nullptr || staging_buffer->size < staging_offset + copy_size) {
2599
- if (sync_staging) {
2600
- ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
2759
+ ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
2601
2760
 
2602
- staging_buffer = dst->device->sync_staging;
2603
- staging_offset = 0;
2604
- } else {
2605
- GGML_ABORT("fatal error");
2606
- }
2607
- }
2761
+ vk_buffer& staging_buffer = dst->device->sync_staging;
2608
2762
 
2609
2763
  VkBufferCopy buf_copy = {
2610
- staging_offset,
2764
+ 0,
2611
2765
  offset,
2612
2766
  copy_size};
2613
2767
 
@@ -2615,17 +2769,17 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
2615
2769
  vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
2616
2770
 
2617
2771
  if (width == spitch) {
2618
- deferred_memcpy((uint8_t *)staging_buffer->ptr + staging_offset, src, width * height, &subctx->in_memcpys);
2772
+ deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
2619
2773
  } else {
2620
2774
  for (size_t i = 0; i < height; i++) {
2621
- deferred_memcpy((uint8_t *)staging_buffer->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
2775
+ deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
2622
2776
  }
2623
2777
  }
2624
2778
  }
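With the hunk above, the write path no longer threads a caller-supplied staging buffer and offset through every call: non-pinned destinations abort unless `sync_staging` is set, and the copy always goes through the device-wide `sync_staging` buffer at offset 0. A reduced host-side sketch of that control flow, with plain stand-ins for the backend's staging buffer and deferred memcpy (nothing here is the real Vulkan API):

```cpp
// Simplified model of the sync-staging fallback: grow the shared staging
// buffer, stage the bytes at offset 0, and (in the real backend) record a
// VkBufferCopy{ 0, dst_offset, size } from staging to the destination buffer.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

struct StagingBuffer {
    std::vector<uint8_t> host;                       // stands in for the mapped sync_staging memory
    void ensure_size(size_t n) { if (host.size() < n) host.resize(n); }
};

void write_through_staging(StagingBuffer& staging, size_t dst_offset,
                           const void* src, size_t size, bool sync_staging) {
    if (!sync_staging) {
        std::fprintf(stderr, "asynchronous write to non-pinned memory not supported\n");
        std::abort();
    }
    staging.ensure_size(size);                       // grow the shared staging buffer
    std::memcpy(staging.host.data(), src, size);     // stage the bytes at offset 0
    std::printf("copy %zu bytes: staging[0] -> dst[%zu]\n", size, dst_offset);
}

int main() {
    StagingBuffer staging;
    const uint32_t data[4] = {1, 2, 3, 4};
    write_through_staging(staging, 256, data, sizeof(data), /*sync_staging=*/true);
}
```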
2625
2779
 
2626
- static void ggml_vk_buffer_write_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
2780
+ static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2627
2781
  VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2628
- return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging);
2782
+ return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
2629
2783
  }
2630
2784
 
2631
2785
  static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
@@ -2638,9 +2792,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
2638
2792
  memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
2639
2793
  }
2640
2794
  } else {
2641
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
2795
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
2642
2796
  ggml_vk_ctx_begin(dst->device, subctx);
2643
- ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true);
2797
+ ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
2644
2798
  ggml_vk_ctx_end(subctx);
2645
2799
 
2646
2800
  for (auto& cpy : subctx->in_memcpys) {
@@ -2650,8 +2804,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
2650
2804
  ggml_vk_submit(subctx, dst->device->fence);
2651
2805
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
2652
2806
  dst->device->device.resetFences({ dst->device->fence });
2653
-
2654
- delete subctx;
2655
2807
  }
2656
2808
  }
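The removal of `delete subctx` in the synchronous paths reflects `vk_context` changing from a raw pointer to a handle type, so temporary transfer contexts clean themselves up when they go out of scope. A hedged sketch of that ownership pattern, using `std::shared_ptr` as a stand-in for whatever handle the backend actually defines:

```cpp
// Short-lived submission context owned by a shared_ptr instead of new/delete.
// "Context" and "create_temporary_context" are illustrative names only.
#include <cstdio>
#include <memory>
#include <vector>

struct Context {
    std::vector<int> seqs;                 // stands in for recorded command sequences
    ~Context() { std::printf("context released\n"); }
};
using ContextHandle = std::shared_ptr<Context>;

ContextHandle create_temporary_context() { return std::make_shared<Context>(); }

void synchronous_write() {
    ContextHandle subctx = create_temporary_context();
    subctx->seqs.push_back(1);             // "record" some work
    // submit + waitForFences + resetFences would happen here
}                                          // subctx destroyed here: no explicit delete

int main() { synchronous_write(); }
```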
2657
2809
 
@@ -2660,12 +2812,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src
2660
2812
  ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
2661
2813
  }
2662
2814
 
2663
- static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
2815
+ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2664
2816
  VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2665
2817
  GGML_ASSERT(width > 0);
2666
2818
  GGML_ASSERT(height > 0);
2667
2819
  GGML_ASSERT(src != nullptr);
2668
2820
 
2821
+ // TODO: staging_offset is not used
2822
+
2669
2823
  // Check if dst is pinned memory
2670
2824
  vk_buffer buf = nullptr;
2671
2825
  size_t buf_offset;
@@ -2695,18 +2849,15 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
2695
2849
  }
2696
2850
  VK_LOG_DEBUG("STAGING");
2697
2851
 
2852
+ if (!sync_staging) {
2853
+ GGML_ABORT("Asynchronous read from non-pinned memory not supported");
2854
+ }
2855
+
2698
2856
  // Fall back to staging buffer
2699
2857
  const size_t copy_size = dpitch * height;
2700
- if (staging_buffer == nullptr || staging_buffer->size < staging_offset + copy_size) {
2701
- if (sync_staging) {
2702
- // Create temporary larger buffer
2703
- ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
2858
+ ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
2704
2859
 
2705
- staging_buffer = src->device->sync_staging;
2706
- } else {
2707
- GGML_ABORT("fatal error");
2708
- }
2709
- }
2860
+ vk_buffer& staging_buffer = src->device->sync_staging;
2710
2861
 
2711
2862
  ggml_vk_sync_buffers(subctx);
2712
2863
  subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
@@ -2714,20 +2865,24 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
2714
2865
  deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
2715
2866
  }
2716
2867
 
2717
- static void ggml_vk_buffer_read_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
2718
- return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging);
2868
+ static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
2869
+ return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
2719
2870
  }
2720
2871
 
2721
2872
  static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
2722
- VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2723
- if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2873
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
2874
+
2875
+ // If the device is not an UMA device the memory is host-accessible through rebar. While writing
2876
+ // through PCIe is sufficient fast reading back data from PCIe is slower than going through
2877
+ // the HW device to host copy path.
2878
+ if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
2724
2879
  GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2725
2880
 
2726
2881
  memcpy(dst, (uint8_t *) src->ptr + offset, size);
2727
2882
  } else {
2728
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
2883
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
2729
2884
  ggml_vk_ctx_begin(src->device, subctx);
2730
- ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true);
2885
+ ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
2731
2886
  ggml_vk_ctx_end(subctx);
2732
2887
 
2733
2888
  ggml_vk_submit(subctx, src->device->fence);
@@ -2737,12 +2892,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
2737
2892
  for (auto& cpy : subctx->out_memcpys) {
2738
2893
  memcpy(cpy.dst, cpy.src, cpy.n);
2739
2894
  }
2740
-
2741
- delete subctx;
2742
2895
  }
2743
2896
  }
2744
2897
 
2745
- static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2898
+ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2746
2899
  VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2747
2900
  // Make sure both buffers are on same device
2748
2901
  GGML_ASSERT(src->device == dst->device);
@@ -2756,15 +2909,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2756
2909
  if (src->device == dst->device) {
2757
2910
  VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2758
2911
  // Copy within the device
2759
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
2912
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
2760
2913
  ggml_vk_ctx_begin(src->device, subctx);
2761
2914
  ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
2762
2915
  ggml_vk_ctx_end(subctx);
2763
2916
  ggml_vk_submit(subctx, src->device->fence);
2764
2917
  VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2765
2918
  src->device->device.resetFences({ src->device->fence });
2766
-
2767
- delete subctx;
2768
2919
  } else {
2769
2920
  VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2770
2921
  // Copy device to device
@@ -2783,7 +2934,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2783
2934
  static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2784
2935
  VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2785
2936
 
2786
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
2937
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
2787
2938
  ggml_vk_ctx_begin(dst->device, subctx);
2788
2939
  subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
2789
2940
  ggml_vk_ctx_end(subctx);
@@ -2791,8 +2942,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
2791
2942
  ggml_vk_submit(subctx, dst->device->fence);
2792
2943
  VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
2793
2944
  dst->device->device.resetFences({ dst->device->fence });
2794
-
2795
- delete subctx;
2796
2945
  }
2797
2946
 
2798
2947
  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
@@ -2855,7 +3004,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
2855
3004
  }
2856
3005
 
2857
3006
  static void ggml_vk_matmul(
2858
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
3007
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
2859
3008
  vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
2860
3009
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2861
3010
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2879,7 +3028,7 @@ static void ggml_vk_matmul(
2879
3028
  }
2880
3029
 
2881
3030
  static void ggml_vk_matmul_id(
2882
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
3031
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
2883
3032
  vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
2884
3033
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2885
3034
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2916,7 +3065,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2916
3065
  GGML_ABORT("fatal error");
2917
3066
  }
2918
3067
 
2919
- static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
3068
+ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2920
3069
  VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2921
3070
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2922
3071
  const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2934,10 +3083,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2934
3083
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
2935
3084
  }
2936
3085
 
2937
- static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3086
+ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
2938
3087
  VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2939
3088
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2940
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3089
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
3090
+ std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
2941
3091
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2942
3092
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
2943
3093
 
@@ -2957,9 +3107,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2957
3107
  const uint64_t r2 = ne12 / ne02;
2958
3108
  const uint64_t r3 = ne13 / ne03;
2959
3109
 
2960
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
2961
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
2962
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3110
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3111
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3112
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
2963
3113
 
2964
3114
  vk_buffer d_Qx;
2965
3115
  size_t qx_buf_offset = 0;
@@ -3011,8 +3161,58 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3011
3161
  const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
3012
3162
  const uint64_t d_sz = sizeof(float) * d_ne;
3013
3163
 
3014
- vk_buffer d_D = extra->buffer_gpu.lock();
3015
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3164
+ vk_pipeline to_fp16_vk_0 = nullptr;
3165
+ vk_pipeline to_fp16_vk_1 = nullptr;
3166
+
3167
+ if (x_non_contig) {
3168
+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3169
+ } else {
3170
+ to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3171
+ }
3172
+ if (y_non_contig) {
3173
+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3174
+ } else {
3175
+ to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3176
+ }
3177
+ GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3178
+ GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3179
+
3180
+ if (dryrun) {
3181
+ const uint64_t x_sz_upd = x_sz * ne02 * ne03;
3182
+ const uint64_t y_sz_upd = y_sz * ne12 * ne13;
3183
+ const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0;
3184
+ if (
3185
+ (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
3186
+ (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
3187
+ (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
3188
+ GGML_ABORT("Requested preallocation size is too large");
3189
+ }
3190
+ if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
3191
+ ctx->prealloc_size_x = x_sz_upd;
3192
+ }
3193
+ if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
3194
+ ctx->prealloc_size_y = y_sz_upd;
3195
+ }
3196
+ if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
3197
+ ctx->prealloc_size_split_k = split_k_size;
3198
+ }
3199
+
3200
+ // Request descriptor sets
3201
+ ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
3202
+ if (qx_needs_dequant) {
3203
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3204
+ }
3205
+ if (qy_needs_dequant) {
3206
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3207
+ }
3208
+ if (split_k > 1) {
3209
+ ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
3210
+ }
3211
+ return;
3212
+ }
3213
+
3214
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3215
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3016
3216
  GGML_ASSERT(d_D != nullptr);
3017
3217
  GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
3018
3218
  vk_buffer d_X;
@@ -3020,13 +3220,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3020
3220
  vk_buffer d_Y;
3021
3221
  uint64_t y_buf_offset = 0;
3022
3222
  if (!src0_uma) {
3023
- d_Qx = extra_src0->buffer_gpu.lock();
3024
- qx_buf_offset = extra_src0->offset + src0->view_offs;
3223
+ d_Qx = src0_buf_ctx->dev_buffer;
3224
+ qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3025
3225
  GGML_ASSERT(d_Qx != nullptr);
3026
3226
  }
3027
3227
  if (!src1_uma) {
3028
- d_Qy = extra_src1->buffer_gpu.lock();
3029
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3228
+ d_Qy = src1_buf_ctx->dev_buffer;
3229
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3030
3230
  GGML_ASSERT(d_Qy != nullptr);
3031
3231
  }
3032
3232
  if (qx_needs_dequant) {
@@ -3046,40 +3246,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3046
3246
  GGML_ASSERT(qy_sz == y_sz);
3047
3247
  }
3048
3248
 
3049
- vk_pipeline to_fp16_vk_0 = nullptr;
3050
- vk_pipeline to_fp16_vk_1 = nullptr;
3051
-
3052
- if (x_non_contig) {
3053
- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3054
- } else {
3055
- to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3056
- }
3057
- if (y_non_contig) {
3058
- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3059
- } else {
3060
- to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3061
- }
3062
- GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3063
- GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3064
-
3065
- // Allocate descriptor sets
3066
- ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
3067
- if (qx_needs_dequant) {
3068
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3069
- }
3070
- if (qy_needs_dequant) {
3071
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3072
- }
3073
- if (split_k > 1) {
3074
- ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
3075
- }
3076
-
3077
3249
  if (x_non_contig) {
3078
3250
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3079
3251
  } else if (qx_needs_dequant) {
3080
3252
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
3081
3253
  ggml_vk_sync_buffers(subctx);
3082
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3254
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3083
3255
  }
3084
3256
  if (y_non_contig) {
3085
3257
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3107,10 +3279,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3107
3279
  ); // NOLINT
3108
3280
  }
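Throughout the matmul paths above, per-tensor `ggml_tensor_extra_gpu` lookups are replaced by reading the `ggml_backend_vk_buffer_context` stored on the tensor's backend buffer, with the byte offset computed as `vk_tensor_offset(tensor) + tensor->view_offs`. A reduced sketch of that lookup shape, using plain stand-in structs rather than ggml's real types:

```cpp
// Resolve (device buffer, offset) from the backend buffer context instead of a
// per-tensor "extra" pointer. All types here are illustrative stand-ins.
#include <cstdint>
#include <cstdio>

struct DeviceBuffer  { uint64_t size; };
struct BufferContext { DeviceBuffer* dev_buffer; };

struct Tensor {
    BufferContext* buffer_ctx;  // stands in for the tensor's backend buffer context
    uint64_t data_offset;       // stands in for vk_tensor_offset(t)
    uint64_t view_offs;
};

void resolve(const Tensor& t, DeviceBuffer*& buf, uint64_t& offset) {
    buf    = t.buffer_ctx->dev_buffer;
    offset = t.data_offset + t.view_offs;
}

int main() {
    DeviceBuffer dev{1 << 20};
    BufferContext ctx{&dev};
    Tensor t{&ctx, 4096, 128};
    DeviceBuffer* buf = nullptr;
    uint64_t off = 0;
    resolve(t, buf, off);
    std::printf("offset = %llu of %llu bytes\n",
                (unsigned long long)off, (unsigned long long)buf->size);
}
```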
3109
3281
 
3110
- static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3282
+ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
3111
3283
  VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3112
3284
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3113
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3285
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
3286
+ std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
3114
3287
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3115
3288
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3116
3289
 
@@ -3134,9 +3307,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3134
3307
  const uint64_t r2 = ne12 / ne02;
3135
3308
  const uint64_t r3 = ne13 / ne03;
3136
3309
 
3137
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3138
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3139
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3310
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3311
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3312
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
3140
3313
 
3141
3314
  vk_buffer d_Qx;
3142
3315
  size_t qx_buf_offset = 0;
@@ -3174,21 +3347,62 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3174
3347
  const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
3175
3348
  const uint64_t d_sz = sizeof(float) * d_ne;
3176
3349
 
3177
- vk_buffer d_D = extra->buffer_gpu.lock();
3178
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3350
+ vk_pipeline to_fp16_vk_0 = nullptr;
3351
+ vk_pipeline to_fp16_vk_1 = nullptr;
3352
+ if (x_non_contig) {
3353
+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3354
+ }
3355
+ if (y_non_contig) {
3356
+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3357
+ } else {
3358
+ to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3359
+ }
3360
+ vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
3361
+ GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3362
+ GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3363
+ GGML_ASSERT(dmmv != nullptr);
3364
+
3365
+ if (dryrun) {
3366
+ const uint64_t x_sz_upd = x_sz * ne02 * ne03;
3367
+ const uint64_t y_sz_upd = y_sz * ne12 * ne13;
3368
+ if (
3369
+ (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
3370
+ (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
3371
+ GGML_ABORT("Requested preallocation size is too large");
3372
+ }
3373
+ if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
3374
+ ctx->prealloc_size_x = x_sz_upd;
3375
+ }
3376
+ if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
3377
+ ctx->prealloc_size_y = y_sz_upd;
3378
+ }
3379
+
3380
+ // Request descriptor sets
3381
+ if (qx_needs_dequant) {
3382
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3383
+ }
3384
+ if (qy_needs_dequant) {
3385
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3386
+ }
3387
+ ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
3388
+ return;
3389
+ }
3390
+
3391
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3392
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3179
3393
  GGML_ASSERT(d_D != nullptr);
3180
3394
  vk_buffer d_X;
3181
3395
  uint64_t x_buf_offset = 0;
3182
3396
  vk_buffer d_Y;
3183
3397
  uint64_t y_buf_offset = 0;
3184
3398
  if(!src0_uma) {
3185
- d_Qx = extra_src0->buffer_gpu.lock();
3186
- qx_buf_offset = extra_src0->offset + src0->view_offs;
3399
+ d_Qx = src0_buf_ctx->dev_buffer;
3400
+ qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3187
3401
  GGML_ASSERT(d_Qx != nullptr);
3188
3402
  }
3189
3403
  if(!src1_uma) {
3190
- d_Qy = extra_src1->buffer_gpu.lock();
3191
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3404
+ d_Qy = src1_buf_ctx->dev_buffer;
3405
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3192
3406
  GGML_ASSERT(d_Qy != nullptr);
3193
3407
  }
3194
3408
  if (qx_needs_dequant) {
@@ -3206,30 +3420,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3206
3420
  GGML_ASSERT(qy_sz == y_sz);
3207
3421
  }
3208
3422
 
3209
- vk_pipeline to_fp16_vk_0 = nullptr;
3210
- vk_pipeline to_fp16_vk_1 = nullptr;
3211
- if (x_non_contig) {
3212
- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3213
- }
3214
- if (y_non_contig) {
3215
- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3216
- } else {
3217
- to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3218
- }
3219
- vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
3220
- GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3221
- GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3222
- GGML_ASSERT(dmmv != nullptr);
3223
-
3224
- // Allocate descriptor sets
3225
- if (qx_needs_dequant) {
3226
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3227
- }
3228
- if (qy_needs_dequant) {
3229
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
3230
- }
3231
- ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13);
3232
-
3233
3423
  if (x_non_contig) {
3234
3424
  GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
3235
3425
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -3268,14 +3458,15 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3268
3458
  };
3269
3459
  ggml_vk_sync_buffers(subctx);
3270
3460
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
3271
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
3461
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
3272
3462
  sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
3273
3463
  }
3274
3464
 
3275
- static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3465
+ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
3276
3466
  VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3277
3467
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3278
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3468
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
3469
+ std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
3279
3470
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3280
3471
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3281
3472
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3294,9 +3485,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3294
3485
 
3295
3486
  GGML_ASSERT(ne11 == 1);
3296
3487
 
3297
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3298
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3299
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3488
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3489
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3490
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
3300
3491
 
3301
3492
  vk_buffer d_Qy;
3302
3493
  size_t qy_buf_offset = 0;
@@ -3316,21 +3507,24 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3316
3507
  const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
3317
3508
  const uint64_t d_sz = sizeof(float) * d_ne;
3318
3509
 
3319
- vk_buffer d_D = extra->buffer_gpu.lock();
3320
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3510
+ if (dryrun) {
3511
+ // Request descriptor sets
3512
+ ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
3513
+ return;
3514
+ }
3515
+
3516
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3517
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3321
3518
  GGML_ASSERT(d_D != nullptr);
3322
- vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3323
- const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3519
+ vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
3520
+ const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3324
3521
  GGML_ASSERT(d_Qx != nullptr);
3325
3522
  if (!src1_uma) {
3326
- d_Qy = extra_src1->buffer_gpu.lock();
3327
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3523
+ d_Qy = src1_buf_ctx->dev_buffer;
3524
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3328
3525
  GGML_ASSERT(d_Qx != nullptr);
3329
3526
  }
3330
3527
 
3331
- // Allocate descriptor sets
3332
- ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
3333
-
3334
3528
  const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3335
3529
  const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
3336
3530
 
@@ -3340,13 +3534,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3340
3534
  // compute
3341
3535
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3342
3536
  ggml_vk_sync_buffers(subctx);
3343
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3537
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3344
3538
  }
3345
3539
 
3346
- static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3540
+ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
3347
3541
  VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3348
3542
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3349
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3543
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
3544
+ std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
3350
3545
  GGML_ASSERT(!ggml_is_transposed(src0));
3351
3546
  GGML_ASSERT(!ggml_is_transposed(src1));
3352
3547
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3368,9 +3563,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3368
3563
 
3369
3564
  GGML_ASSERT(ne11 == 1);
3370
3565
 
3371
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3372
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3373
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3566
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3567
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3568
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
3374
3569
 
3375
3570
  vk_buffer d_Qy = nullptr;
3376
3571
  size_t qy_buf_offset = 0;
@@ -3391,21 +3586,24 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3391
3586
  const uint64_t qy_sz = ggml_nbytes(src1);
3392
3587
  const uint64_t d_sz = sizeof(float) * d_ne;
3393
3588
 
3394
- vk_buffer d_D = extra->buffer_gpu.lock();
3395
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3396
- GGML_ASSERT(d_D != nullptr);
3397
- vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3398
- const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3589
+ if (dryrun) {
3590
+ // Request descriptor sets
3591
+ ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
3592
+ return;
3593
+ }
3594
+
3595
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3596
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3597
+ GGML_ASSERT(d_D != nullptr);
3598
+ vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
3599
+ const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3399
3600
  GGML_ASSERT(d_Qx != nullptr);
3400
3601
  if (!src1_uma) {
3401
- d_Qy = extra_src1->buffer_gpu.lock();
3402
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3602
+ d_Qy = src1_buf_ctx->dev_buffer;
3603
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3403
3604
  GGML_ASSERT(d_Qx != nullptr);
3404
3605
  }
3405
3606
 
3406
- // Allocate descriptor sets
3407
- ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
3408
-
3409
3607
  const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3410
3608
  const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
3411
3609
 
@@ -3415,23 +3613,24 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3415
3613
  // compute
3416
3614
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3417
3615
  ggml_vk_sync_buffers(subctx);
3418
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3616
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
3617
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3419
3618
  }
3420
3619
 
3421
- static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3620
+ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
3422
3621
  VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3423
3622
  if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3424
- ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3623
+ ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
3425
3624
  } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
3426
- ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst);
3625
+ ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
3427
3626
  } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3428
- ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst);
3627
+ ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
3429
3628
  } else {
3430
- ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst);
3629
+ ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
3431
3630
  }
3432
3631
  }
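The `dryrun` flag threaded through `ggml_vk_mul_mat` and its helpers above enables a two-pass evaluation: a first pass that only grows preallocation sizes and requests descriptor sets, then a second pass that records the actual dispatches once buffers and sets exist. A compact sketch of that shape (illustrative types; not the backend's real signatures):

```cpp
// Two-pass pattern: the dryrun pass sizes resources, the second pass records work.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Prealloc { uint64_t size_x = 0, size_y = 0; uint32_t descriptor_sets = 0; };
struct MatMulOp { uint64_t x_bytes, y_bytes; };

void run_op(Prealloc& pre, const MatMulOp& op, bool dryrun) {
    if (dryrun) {
        pre.size_x = std::max(pre.size_x, op.x_bytes);   // grow the x preallocation
        pre.size_y = std::max(pre.size_y, op.y_bytes);   // grow the y preallocation
        pre.descriptor_sets += 1;                        // request descriptor sets
        return;
    }
    std::printf("dispatch: x=%llu y=%llu bytes\n",
                (unsigned long long)op.x_bytes, (unsigned long long)op.y_bytes);
}

int main() {
    std::vector<MatMulOp> graph = {{1024, 512}, {4096, 2048}};
    Prealloc pre;
    for (const auto& op : graph) run_op(pre, op, /*dryrun=*/true);   // sizing pass
    // allocate pre.size_x / pre.size_y and pre.descriptor_sets here
    for (const auto& op : graph) run_op(pre, op, /*dryrun=*/false);  // recording pass
}
```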
3433
3632
 
3434
- static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3633
+ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
3435
3634
  VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3436
3635
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3437
3636
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3463,10 +3662,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3463
3662
 
3464
3663
  const uint64_t n_as = ne02;
3465
3664
 
3466
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3467
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3468
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3469
- ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
3665
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3666
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3667
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
3668
+ ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
3470
3669
 
3471
3670
  vk_buffer d_Qx;
3472
3671
  size_t qx_buf_offset = 0;
@@ -3521,26 +3720,68 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3521
3720
  const uint64_t ids_sz = nbi2;
3522
3721
  const uint64_t d_sz = sizeof(float) * d_ne;
3523
3722
 
3524
- vk_buffer d_D = extra->buffer_gpu.lock();
3525
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3723
+ vk_pipeline to_fp16_vk_0 = nullptr;
3724
+ vk_pipeline to_fp16_vk_1 = nullptr;
3725
+
3726
+ if (x_non_contig) {
3727
+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3728
+ } else {
3729
+ to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3730
+ }
3731
+ if (y_non_contig) {
3732
+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3733
+ } else {
3734
+ to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3735
+ }
3736
+ GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3737
+ GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3738
+
3739
+ if (dryrun) {
3740
+ const uint64_t x_sz_upd = x_sz * ne02 * ne03;
3741
+ const uint64_t y_sz_upd = y_sz * ne12 * ne13;
3742
+ if (
3743
+ (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
3744
+ (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
3745
+ GGML_ABORT("Requested preallocation size is too large");
3746
+ }
3747
+ if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
3748
+ ctx->prealloc_size_x = x_sz_upd;
3749
+ }
3750
+ if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
3751
+ ctx->prealloc_size_y = y_sz_upd;
3752
+ }
3753
+
3754
+ // Request descriptor sets
3755
+ ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
3756
+ if (qx_needs_dequant) {
3757
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3758
+ }
3759
+ if (qy_needs_dequant) {
3760
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3761
+ }
3762
+ return;
3763
+ }
3764
+
3765
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3766
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3526
3767
  GGML_ASSERT(d_D != nullptr);
3527
3768
  vk_buffer d_X;
3528
3769
  uint64_t x_buf_offset = 0;
3529
3770
  vk_buffer d_Y;
3530
3771
  uint64_t y_buf_offset = 0;
3531
3772
  if (!src0_uma) {
3532
- d_Qx = extra_src0->buffer_gpu.lock();
3533
- qx_buf_offset = extra_src0->offset + src0->view_offs;
3773
+ d_Qx = src0_buf_ctx->dev_buffer;
3774
+ qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3534
3775
  GGML_ASSERT(d_Qx != nullptr);
3535
3776
  }
3536
3777
  if (!src1_uma) {
3537
- d_Qy = extra_src1->buffer_gpu.lock();
3538
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3778
+ d_Qy = src1_buf_ctx->dev_buffer;
3779
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3539
3780
  GGML_ASSERT(d_Qy != nullptr);
3540
3781
  }
3541
3782
  if (!ids_uma) {
3542
- d_ids = extra_ids->buffer_gpu.lock();
3543
- ids_buf_offset = extra_ids->offset + ids->view_offs;
3783
+ d_ids = ids_buf_ctx->dev_buffer;
3784
+ ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
3544
3785
  GGML_ASSERT(d_ids != nullptr);
3545
3786
  }
3546
3787
  if (qx_needs_dequant) {
@@ -3560,37 +3801,13 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3560
3801
  GGML_ASSERT(qy_sz == y_sz);
3561
3802
  }
3562
3803
 
3563
- vk_pipeline to_fp16_vk_0 = nullptr;
3564
- vk_pipeline to_fp16_vk_1 = nullptr;
3565
-
3566
- if (x_non_contig) {
3567
- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
3568
- } else {
3569
- to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
3570
- }
3571
- if (y_non_contig) {
3572
- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
3573
- } else {
3574
- to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3575
- }
3576
- GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3577
- GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3578
-
3579
- // Allocate descriptor sets
3580
- ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
3581
- if (qx_needs_dequant) {
3582
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3583
- }
3584
- if (qy_needs_dequant) {
3585
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3586
- }
3587
-
3588
3804
  if (x_non_contig) {
3589
3805
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
3590
3806
  } else if (qx_needs_dequant) {
3591
3807
  const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
3592
3808
  ggml_vk_sync_buffers(subctx);
3593
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3809
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
3810
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
3594
3811
  }
3595
3812
  if (y_non_contig) {
3596
3813
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3618,11 +3835,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3618
3835
  ); // NOLINT
3619
3836
  }
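The hunks above add a `dryrun` parameter to the matrix-multiply paths: the graph is walked once without recording any commands, only to grow `prealloc_size_x`/`prealloc_size_y` (bounded by `max_memory_allocation_size`) and to *request* descriptor sets per pipeline, presumably so the pools can be allocated in one batch before the real recording pass. Below is a minimal, self-contained sketch of that two-pass idea; `FakeContext`, `FakePipeline` and `plan_or_record` are stand-in names for illustration, not part of the ggml Vulkan backend.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <stdexcept>

// Stand-in state, loosely mirroring what the backend context tracks.
struct FakePipeline { uint32_t descriptor_sets_requested = 0; };
struct FakeContext {
    uint64_t max_alloc        = 1ull << 30; // device allocation limit
    uint64_t prealloc_size_x  = 0;          // staging buffer for dequantized/contiguous src0
    uint64_t prealloc_size_y  = 0;          // staging buffer for src1
};

// One node's worth of work: either grow the plan (dryrun) or "record" it.
static void plan_or_record(FakeContext & ctx, FakePipeline & pipeline,
                           uint64_t x_sz, uint64_t y_sz, bool dryrun) {
    if (dryrun) {
        if (x_sz > ctx.max_alloc || y_sz > ctx.max_alloc) {
            throw std::runtime_error("requested preallocation size is too large");
        }
        ctx.prealloc_size_x = std::max(ctx.prealloc_size_x, x_sz);
        ctx.prealloc_size_y = std::max(ctx.prealloc_size_y, y_sz);
        pipeline.descriptor_sets_requested += 1; // allocated later, in one batch
        return;
    }
    // Second pass: staging buffers are already sized and descriptor pools exist.
    std::printf("dispatch with x=%llu y=%llu\n",
                (unsigned long long) x_sz, (unsigned long long) y_sz);
}

int main() {
    FakeContext  ctx;
    FakePipeline mul_mat;
    const uint64_t sizes[][2] = { {1024, 512}, {4096, 2048}, {2048, 8192} };

    for (auto & s : sizes) plan_or_record(ctx, mul_mat, s[0], s[1], /*dryrun=*/true);
    std::printf("prealloc x=%llu y=%llu, descriptor sets=%u\n",
                (unsigned long long) ctx.prealloc_size_x,
                (unsigned long long) ctx.prealloc_size_y,
                mul_mat.descriptor_sets_requested);
    for (auto & s : sizes) plan_or_record(ctx, mul_mat, s[0], s[1], /*dryrun=*/false);
    return 0;
}
```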
3620
3837
 
3621
- static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3838
+ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
3622
3839
  VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3623
3840
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3624
3841
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3625
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3842
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
3843
+ std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
3626
3844
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3627
3845
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3628
3846
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3649,10 +3867,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3649
3867
  const uint64_t ne22 = dst->ne[2];
3650
3868
  const uint64_t ne23 = dst->ne[3];
3651
3869
 
3652
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3653
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3654
- ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
3655
- ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
3870
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
3871
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
3872
+ ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
3873
+ ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
3656
3874
 
3657
3875
  vk_buffer d_Qx;
3658
3876
  size_t qx_buf_offset = 0;
@@ -3696,26 +3914,67 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3696
3914
  const uint64_t ids_sz = nbi2;
3697
3915
  const uint64_t d_sz = sizeof(float) * d_ne;
3698
3916
 
3699
- vk_buffer d_D = extra->buffer_gpu.lock();
3700
- const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3917
+ vk_pipeline to_fp16_vk_0 = nullptr;
3918
+ vk_pipeline to_fp16_vk_1 = nullptr;
3919
+ if (x_non_contig) {
3920
+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3921
+ }
3922
+ if (y_non_contig) {
3923
+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3924
+ } else {
3925
+ to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3926
+ }
3927
+ vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
3928
+ GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3929
+ GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3930
+ GGML_ASSERT(dmmv != nullptr);
3931
+
3932
+ if (dryrun) {
3933
+ const uint64_t x_sz_upd = x_sz * ne02 * ne03;
3934
+ const uint64_t y_sz_upd = y_sz * ne12 * ne13;
3935
+ if (
3936
+ (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
3937
+ (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
3938
+ GGML_ABORT("Requested preallocation size is too large");
3939
+ }
3940
+ if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
3941
+ ctx->prealloc_size_x = x_sz_upd;
3942
+ }
3943
+ if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
3944
+ ctx->prealloc_size_y = y_sz_upd;
3945
+ }
3946
+
3947
+ // Request descriptor sets
3948
+ if (qx_needs_dequant) {
3949
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3950
+ }
3951
+ if (qy_needs_dequant) {
3952
+ ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
3953
+ }
3954
+ ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
3955
+ return;
3956
+ }
3957
+
3958
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
3959
+ const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
3701
3960
  GGML_ASSERT(d_D != nullptr);
3702
3961
  vk_buffer d_X;
3703
3962
  uint64_t x_buf_offset = 0;
3704
3963
  vk_buffer d_Y;
3705
3964
  uint64_t y_buf_offset = 0;
3706
3965
  if(!src0_uma) {
3707
- d_Qx = extra_src0->buffer_gpu.lock();
3708
- qx_buf_offset = extra_src0->offset + src0->view_offs;
3966
+ d_Qx = src0_buf_ctx->dev_buffer;
3967
+ qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
3709
3968
  GGML_ASSERT(d_Qx != nullptr);
3710
3969
  }
3711
3970
  if(!src1_uma) {
3712
- d_Qy = extra_src1->buffer_gpu.lock();
3713
- qy_buf_offset = extra_src1->offset + src1->view_offs;
3971
+ d_Qy = src1_buf_ctx->dev_buffer;
3972
+ qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
3714
3973
  GGML_ASSERT(d_Qy != nullptr);
3715
3974
  }
3716
3975
  if(!ids_uma) {
3717
- d_ids = extra_ids->buffer_gpu.lock();
3718
- ids_buf_offset = extra_ids->offset + ids->view_offs;
3976
+ d_ids = ids_buf_ctx->dev_buffer;
3977
+ ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
3719
3978
  GGML_ASSERT(d_ids != nullptr);
3720
3979
  }
3721
3980
  if (qx_needs_dequant) {
@@ -3733,30 +3992,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3733
3992
  GGML_ASSERT(qy_sz == y_sz);
3734
3993
  }
3735
3994
 
3736
- vk_pipeline to_fp16_vk_0 = nullptr;
3737
- vk_pipeline to_fp16_vk_1 = nullptr;
3738
- if (x_non_contig) {
3739
- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
3740
- }
3741
- if (y_non_contig) {
3742
- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
3743
- } else {
3744
- to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
3745
- }
3746
- vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
3747
- GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
3748
- GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
3749
- GGML_ASSERT(dmmv != nullptr);
3750
-
3751
- // Allocate descriptor sets
3752
- if (qx_needs_dequant) {
3753
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
3754
- }
3755
- if (qy_needs_dequant) {
3756
- ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
3757
- }
3758
- ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13);
3759
-
3760
3995
  if (x_non_contig) {
3761
3996
  GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
3762
3997
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -3790,95 +4025,22 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3790
4025
  };
3791
4026
  ggml_vk_sync_buffers(subctx);
3792
4027
  ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
3793
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
4028
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
4029
+ vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
3794
4030
  sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
3795
4031
  }
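A second pattern that repeats throughout these hunks: the per-tensor `ggml_tensor_extra_gpu` (with its lockable `buffer_gpu` handle and cached `offset`) is dropped in favour of reading `dev_buffer` straight from the tensor's `ggml_backend_vk_buffer_context` and recomputing the offset with `vk_tensor_offset()`. A rough before/after sketch with invented stand-in types (`DummyBuffer`, `DummyTensor`, `tensor_offset`); the assumption that the offset is derived from the tensor's data pointer relative to the buffer base is ours, not something spelled out in this diff.

```cpp
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct DummyBuffer { const char * base = nullptr; };          // ~ a device allocation
struct DummyBufferContext {                                    // ~ ggml_backend_vk_buffer_context
    std::shared_ptr<DummyBuffer> dev_buffer;
};
struct DummyTensor {
    DummyBufferContext * buffer_ctx;   // ~ (ggml_backend_vk_buffer_context *) tensor->buffer->context
    const char *         data;         // where the tensor's bytes start
    uint64_t             view_offs;    // extra offset for views
};

// Assumed shape of vk_tensor_offset(): distance from the buffer base to the data pointer.
static uint64_t tensor_offset(const DummyTensor & t) {
    return (uint64_t)(t.data - t.buffer_ctx->dev_buffer->base);
}

int main() {
    std::vector<char> device_memory(4096);                      // pretend device allocation
    auto buf = std::make_shared<DummyBuffer>();
    buf->base = device_memory.data();

    DummyBufferContext buf_ctx{ buf };
    DummyTensor t{ &buf_ctx, device_memory.data() + 1024, 32 }; // tensor lives 1024 bytes in

    std::shared_ptr<DummyBuffer> d_Qx = t.buffer_ctx->dev_buffer;  // was: extra->buffer_gpu.lock()
    const uint64_t qx_buf_offset = tensor_offset(t) + t.view_offs; // was: extra->offset + view_offs
    std::printf("offset into device buffer: %llu bytes\n",
                (unsigned long long) qx_buf_offset);               // prints 1056
    return 0;
}
```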
3796
4032
 
3797
- static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4033
+ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
3798
4034
  VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
3799
4035
  if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3800
- ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
4036
+ ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
3801
4037
  } else {
3802
- ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
4038
+ ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
3803
4039
  }
3804
4040
  }
3805
4041
 
3806
- static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3807
- // guaranteed to be an integer due to the check in ggml_can_repeat
3808
- const uint64_t ne0 = dst->ne[0];
3809
- const uint64_t ne1 = dst->ne[1];
3810
- const uint64_t ne2 = dst->ne[2];
3811
- const uint64_t ne3 = dst->ne[3];
3812
-
3813
- const uint64_t ne00 = src0->ne[0];
3814
- const uint64_t ne01 = src0->ne[1];
3815
- const uint64_t ne02 = src0->ne[2];
3816
- const uint64_t ne03 = src0->ne[3];
3817
-
3818
- const uint64_t nb0 = dst->nb[0];
3819
- const uint64_t nb1 = dst->nb[1];
3820
- const uint64_t nb2 = dst->nb[2];
3821
- const uint64_t nb3 = dst->nb[3];
3822
-
3823
- const uint64_t nb00 = src0->nb[0];
3824
- const uint64_t nb01 = src0->nb[1];
3825
- const uint64_t nb02 = src0->nb[2];
3826
- const uint64_t nb03 = src0->nb[3];
3827
-
3828
- const uint64_t nr0 = ne0/ne00;
3829
- const uint64_t nr1 = ne1/ne01;
3830
- const uint64_t nr2 = ne2/ne02;
3831
- const uint64_t nr3 = ne3/ne03;
3832
-
3833
- // TODO: support for transposed / permuted tensors
3834
- GGML_ASSERT(nb0 == sizeof(float));
3835
- GGML_ASSERT(nb00 == sizeof(float));
3836
-
3837
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3838
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3839
-
3840
- const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
3841
- const uint64_t src_offset = extra_src0->offset + src0->view_offs;
3842
- vk_buffer dst_buf = extra->buffer_gpu.lock();
3843
- const uint64_t dst_offset = extra->offset + dst->view_offs;
3844
-
3845
- std::vector<vk::BufferCopy> copies;
3846
-
3847
- for (uint64_t i3 = 0; i3 < nr3; i3++) {
3848
- for (uint64_t k3 = 0; k3 < ne03; k3++) {
3849
- for (uint64_t i2 = 0; i2 < nr2; i2++) {
3850
- for (uint64_t k2 = 0; k2 < ne02; k2++) {
3851
- for (uint64_t i1 = 0; i1 < nr1; i1++) {
3852
- for (uint64_t k1 = 0; k1 < ne01; k1++) {
3853
- for (uint64_t i0 = 0; i0 < nr0; i0++) {
3854
- copies.push_back({
3855
- src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
3856
- dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
3857
- ne00*nb0,
3858
- });
3859
- }
3860
- }
3861
- }
3862
- }
3863
- }
3864
- }
3865
- }
3866
-
3867
- ggml_vk_sync_buffers(subctx);
3868
- subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
3869
-
3870
- GGML_UNUSED(ctx);
3871
- GGML_UNUSED(src1);
3872
- }
3873
-
3874
-
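The deleted `ggml_vk_op_repeat` above implemented `GGML_OP_REPEAT` on the host as a list of buffer-copy regions; after this change REPEAT goes through a regular elementwise pipeline (`pipeline_repeat_f32`, added to `ggml_vk_op_get_pipeline` and to the incontiguous-capable ops below). The index math such a kernel needs is a plain modulo broadcast. A CPU reference sketch for the contiguous f32 case, to show the mapping only (not the shader itself):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// CPU reference for GGML_OP_REPEAT on contiguous f32 data: every destination
// coordinate maps back to src with a modulo in each dimension.
static void repeat_f32(const float * src, const int64_t src_ne[4],
                       float * dst, const int64_t dst_ne[4]) {
    for (int64_t i3 = 0; i3 < dst_ne[3]; ++i3)
    for (int64_t i2 = 0; i2 < dst_ne[2]; ++i2)
    for (int64_t i1 = 0; i1 < dst_ne[1]; ++i1)
    for (int64_t i0 = 0; i0 < dst_ne[0]; ++i0) {
        const int64_t s0 = i0 % src_ne[0], s1 = i1 % src_ne[1];
        const int64_t s2 = i2 % src_ne[2], s3 = i3 % src_ne[3];
        dst[((i3*dst_ne[2] + i2)*dst_ne[1] + i1)*dst_ne[0] + i0] =
        src[((s3*src_ne[2] + s2)*src_ne[1] + s1)*src_ne[0] + s0];
    }
}

int main() {
    const int64_t src_ne[4] = { 2, 1, 1, 1 };
    const int64_t dst_ne[4] = { 4, 2, 1, 1 };
    std::vector<float> src = { 1.0f, 2.0f };
    std::vector<float> dst(4 * 2);
    repeat_f32(src.data(), src_ne, dst.data(), dst_ne);
    for (float v : dst) std::printf("%.0f ", v);  // 1 2 1 2 1 2 1 2
    std::printf("\n");
    return 0;
}
```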
3875
4042
  static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
3876
4043
  switch (op) {
3877
- case GGML_OP_ADD:
3878
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3879
- return ctx->device->pipeline_add_f32;
3880
- }
3881
- return nullptr;
3882
4044
  case GGML_OP_GET_ROWS:
3883
4045
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
3884
4046
  if (dst->type == GGML_TYPE_F16) {
@@ -3888,6 +4050,19 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3888
4050
  return ctx->device->pipeline_get_rows_f32[src0->type];
3889
4051
  }
3890
4052
  return nullptr;
4053
+ case GGML_OP_ACC:
4054
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4055
+ return ctx->device->pipeline_acc_f32;
4056
+ }
4057
+ return nullptr;
4058
+ case GGML_OP_ADD:
4059
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4060
+ return ctx->device->pipeline_add_f32;
4061
+ }
4062
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
4063
+ return ctx->device->pipeline_add_f16_f32_f16;
4064
+ }
4065
+ return nullptr;
3891
4066
  case GGML_OP_MUL:
3892
4067
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3893
4068
  return ctx->device->pipeline_mul_f32;
@@ -3898,6 +4073,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3898
4073
  return ctx->device->pipeline_div_f32;
3899
4074
  }
3900
4075
  return nullptr;
4076
+ case GGML_OP_CONCAT:
4077
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4078
+ return ctx->device->pipeline_concat_f32;
4079
+ }
4080
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
4081
+ return ctx->device->pipeline_concat_f16;
4082
+ }
4083
+ if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
4084
+ return ctx->device->pipeline_concat_i32;
4085
+ }
4086
+ return nullptr;
4087
+ case GGML_OP_UPSCALE:
4088
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4089
+ return ctx->device->pipeline_upscale_f32;
4090
+ }
4091
+ return nullptr;
3901
4092
  case GGML_OP_SCALE:
3902
4093
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3903
4094
  return ctx->device->pipeline_scale_f32;
@@ -3908,11 +4099,31 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3908
4099
  return ctx->device->pipeline_sqr_f32;
3909
4100
  }
3910
4101
  return nullptr;
4102
+ case GGML_OP_SIN:
4103
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4104
+ return ctx->device->pipeline_sin_f32;
4105
+ }
4106
+ return nullptr;
4107
+ case GGML_OP_COS:
4108
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4109
+ return ctx->device->pipeline_cos_f32;
4110
+ }
4111
+ return nullptr;
3911
4112
  case GGML_OP_CLAMP:
3912
4113
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3913
4114
  return ctx->device->pipeline_clamp_f32;
3914
4115
  }
3915
4116
  return nullptr;
4117
+ case GGML_OP_PAD:
4118
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4119
+ return ctx->device->pipeline_pad_f32;
4120
+ }
4121
+ return nullptr;
4122
+ case GGML_OP_REPEAT:
4123
+ if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
4124
+ return ctx->device->pipeline_repeat_f32;
4125
+ }
4126
+ return nullptr;
3916
4127
  case GGML_OP_CPY:
3917
4128
  case GGML_OP_CONT:
3918
4129
  case GGML_OP_DUP:
@@ -3922,6 +4133,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3922
4133
  return ctx->device->pipeline_norm_f32;
3923
4134
  }
3924
4135
  return nullptr;
4136
+ case GGML_OP_GROUP_NORM:
4137
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4138
+ return ctx->device->pipeline_group_norm_f32;
4139
+ }
4140
+ return nullptr;
3925
4141
  case GGML_OP_RMS_NORM:
3926
4142
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3927
4143
  return ctx->device->pipeline_rms_norm_f32;
@@ -3939,11 +4155,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3939
4155
  return ctx->device->pipeline_gelu_f32;
3940
4156
  }
3941
4157
  break;
4158
+ case GGML_UNARY_OP_GELU_QUICK:
4159
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4160
+ return ctx->device->pipeline_gelu_quick_f32;
4161
+ }
4162
+ break;
3942
4163
  case GGML_UNARY_OP_RELU:
3943
4164
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3944
4165
  return ctx->device->pipeline_relu_f32;
3945
4166
  }
3946
4167
  break;
4168
+ case GGML_UNARY_OP_TANH:
4169
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4170
+ return ctx->device->pipeline_tanh_f32;
4171
+ }
4172
+ break;
3947
4173
  default:
3948
4174
  break;
3949
4175
  }
@@ -3966,7 +4192,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3966
4192
  case GGML_OP_ROPE:
3967
4193
  {
3968
4194
  const int mode = ((const int32_t *) dst->op_params)[2];
3969
- const bool is_neox = mode & 2;
4195
+ const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
3970
4196
 
3971
4197
  if (is_neox) {
3972
4198
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
@@ -3995,6 +4221,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3995
4221
  return ctx->device->pipeline_sum_rows_f32;
3996
4222
  }
3997
4223
  return nullptr;
4224
+ case GGML_OP_IM2COL:
4225
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4226
+ return ctx->device->pipeline_im2col_f32;
4227
+ }
4228
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
4229
+ return ctx->device->pipeline_im2col_f32_f16;
4230
+ }
4231
+ return nullptr;
4232
+ case GGML_OP_TIMESTEP_EMBEDDING:
4233
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4234
+ return ctx->device->pipeline_timestep_embedding_f32;
4235
+ }
4236
+ return nullptr;
4237
+ case GGML_OP_LEAKY_RELU:
4238
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
4239
+ return ctx->device->pipeline_leaky_relu_f32;
4240
+ }
4241
+ return nullptr;
3998
4242
  default:
3999
4243
  return nullptr;
4000
4244
  }
@@ -4002,15 +4246,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
4002
4246
  GGML_UNUSED(src2);
4003
4247
  }
4004
4248
 
4005
- static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
4006
- switch(op) {
4007
- case GGML_OP_REPEAT:
4008
- return ggml_vk_op_repeat;
4009
- default:
4010
- return nullptr;
4011
- }
4012
- }
4013
-
4014
4249
  static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
4015
4250
  switch (op) {
4016
4251
  case GGML_OP_CPY:
@@ -4018,9 +4253,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
4018
4253
  case GGML_OP_ADD:
4019
4254
  case GGML_OP_MUL:
4020
4255
  case GGML_OP_DIV:
4256
+ case GGML_OP_CONCAT:
4257
+ case GGML_OP_UPSCALE:
4021
4258
  case GGML_OP_SCALE:
4022
4259
  case GGML_OP_SQR:
4260
+ case GGML_OP_SIN:
4261
+ case GGML_OP_COS:
4023
4262
  case GGML_OP_CLAMP:
4263
+ case GGML_OP_PAD:
4264
+ case GGML_OP_REPEAT:
4024
4265
  return true;
4025
4266
  default:
4026
4267
  return false;
@@ -4028,7 +4269,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
4028
4269
  }
4029
4270
 
4030
4271
  template<typename PC>
4031
- static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
4272
+ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) {
4032
4273
  VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
4033
4274
  if (src1 != nullptr) {
4034
4275
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4036,10 +4277,11 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4036
4277
  if (src2 != nullptr) {
4037
4278
  std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
4038
4279
  }
4039
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
4280
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
4281
+ std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
4040
4282
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
4041
4283
  GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
4042
- GGML_ASSERT(dst->extra != nullptr);
4284
+ GGML_ASSERT(dst->buffer != nullptr);
4043
4285
  const uint64_t ne00 = src0->ne[0];
4044
4286
  const uint64_t ne01 = src0->ne[1];
4045
4287
  const uint64_t ne02 = src0->ne[2];
@@ -4068,29 +4310,27 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4068
4310
  const uint64_t ned = ned0 * ned1;
4069
4311
 
4070
4312
  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
4071
- ggml_vk_func_t op_func;
4072
4313
 
4073
4314
  if (pipeline == nullptr) {
4074
- op_func = ggml_vk_op_get_func(op);
4075
- if (op_func == nullptr) {
4076
- std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
4077
- if (src1 != nullptr) {
4078
- std::cerr << " and " << ggml_type_name(src1->type);
4079
- }
4080
- std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
4081
- GGML_ABORT("fatal error");
4315
+ std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
4316
+ if (src1 != nullptr) {
4317
+ std::cerr << " and " << ggml_type_name(src1->type);
4082
4318
  }
4319
+ std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
4320
+ GGML_ABORT("fatal error");
4321
+ }
4083
4322
 
4084
- op_func(ctx, subctx, src0, src1, dst);
4323
+ if (dryrun) {
4324
+ ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
4085
4325
  return;
4086
4326
  }
4087
4327
 
4088
4328
  const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
4089
4329
 
4090
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
4091
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
4092
- ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
4093
- ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
4330
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
4331
+ ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
4332
+ ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
4333
+ ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
4094
4334
 
4095
4335
  vk_buffer d_X = nullptr;
4096
4336
  size_t x_buf_offset = 0;
@@ -4121,29 +4361,29 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4121
4361
  uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
4122
4362
  uint64_t d_sz = ggml_type_size(dst->type) * ned;
4123
4363
 
4124
- vk_buffer d_D = extra->buffer_gpu.lock();
4364
+ vk_buffer d_D = dst_buf_ctx->dev_buffer;
4125
4365
 
4126
4366
  // Workaround for tiny tensor inputs on ROPE
4127
- if (use_src1 && y_sz > d_D->size) {
4367
+ if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
4128
4368
  y_sz = VK_WHOLE_SIZE;
4129
4369
  }
4130
4370
 
4131
4371
  GGML_ASSERT(d_D != nullptr);
4132
- uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
4133
- GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
4372
+ uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
4373
+ GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
4134
4374
  if(!src0_uma) {
4135
- d_X = extra_src0->buffer_gpu.lock();
4136
- x_buf_offset = extra_src0->offset + src0->view_offs;
4375
+ d_X = src0_buf_ctx->dev_buffer;
4376
+ x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
4137
4377
  GGML_ASSERT(d_X != nullptr);
4138
4378
  }
4139
4379
  if (use_src1 && !src1_uma) {
4140
- d_Y = extra_src1->buffer_gpu.lock();
4141
- y_buf_offset = extra_src1->offset + src1->view_offs;
4380
+ d_Y = src1_buf_ctx->dev_buffer;
4381
+ y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
4142
4382
  GGML_ASSERT(d_Y != nullptr);
4143
4383
  }
4144
4384
  if (use_src2 && !src2_uma) {
4145
- d_Z = extra_src2->buffer_gpu.lock();
4146
- z_buf_offset = extra_src2->offset + src2->view_offs;
4385
+ d_Z = src2_buf_ctx->dev_buffer;
4386
+ z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
4147
4387
  GGML_ASSERT(d_Z != nullptr);
4148
4388
  }
4149
4389
 
@@ -4170,127 +4410,143 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4170
4410
  std::array<uint32_t, 3> elements;
4171
4411
 
4172
4412
  // Single call if dimension 2 is contiguous
4173
- if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
4174
- ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
4413
+ GGML_ASSERT(op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1))));
4175
4414
 
4176
- switch (dst->op) {
4177
- case GGML_OP_NORM:
4178
- case GGML_OP_RMS_NORM:
4179
- case GGML_OP_SOFT_MAX:
4180
- case GGML_OP_SUM_ROWS:
4181
- elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
4182
- break;
4183
- case GGML_OP_DIAG_MASK_INF:
4184
- case GGML_OP_ROPE:
4185
- elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
4186
- break;
4187
- case GGML_OP_GET_ROWS:
4188
- elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
4189
- break;
4190
- case GGML_OP_ARGSORT:
4191
- elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
4192
- break;
4193
- default:
4194
- elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
4195
- break;
4196
- }
4197
-
4198
- if (!op_supports_incontiguous) {
4199
- if (x_sz != VK_WHOLE_SIZE) {
4200
- x_sz *= ne02 * ne03;
4201
- }
4202
- if (use_src1 && y_sz != VK_WHOLE_SIZE) {
4203
- y_sz *= ne12 * ne13;
4204
- }
4205
- if (use_src2 && z_sz != VK_WHOLE_SIZE) {
4206
- z_sz *= ne22 * ne23;
4207
- }
4208
- if (d_sz != VK_WHOLE_SIZE) {
4209
- d_sz *= ned2 * ned3;
4210
- }
4211
- }
4212
-
4213
- if (op == GGML_OP_SOFT_MAX) {
4214
- // Empty src1 is possible in soft_max, but the shader needs a buffer
4215
- vk_subbuffer subbuf_y;
4216
- if (use_src1) {
4217
- subbuf_y = { d_Y, y_buf_offset, y_sz };
4415
+ switch (op) {
4416
+ case GGML_OP_NORM:
4417
+ case GGML_OP_RMS_NORM:
4418
+ case GGML_OP_SOFT_MAX:
4419
+ case GGML_OP_SUM_ROWS:
4420
+ {
4421
+ const uint32_t nr = ggml_nrows(src0);
4422
+ if (nr > 262144) {
4423
+ elements = { 512, 512, CEIL_DIV(nr, 262144) };
4424
+ } else if (nr > 512) {
4425
+ elements = { 512, CEIL_DIV(nr, 512), 1 };
4218
4426
  } else {
4219
- subbuf_y = { d_X, 0, d_X->size };
4427
+ elements = { nr, 1, 1 };
4220
4428
  }
4429
+ } break;
4430
+ case GGML_OP_GROUP_NORM:
4431
+ {
4432
+ const uint32_t num_groups = dst->op_params[0];
4433
+ elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
4434
+ } break;
4435
+ case GGML_OP_DIAG_MASK_INF:
4436
+ case GGML_OP_ROPE:
4437
+ elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
4438
+ break;
4439
+ case GGML_OP_GET_ROWS:
4440
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
4441
+ break;
4442
+ case GGML_OP_ARGSORT:
4443
+ elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
4444
+ break;
4445
+ case GGML_OP_IM2COL:
4446
+ {
4447
+ const bool is_2D = dst->op_params[6] == 1;
4221
4448
 
4222
- ggml_vk_sync_buffers(subctx);
4223
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4224
- } else if (op == GGML_OP_ROPE) {
4225
- // Empty src2 is possible in rope, but the shader needs a buffer
4226
- vk_subbuffer subbuf_z;
4227
- if (use_src2) {
4228
- subbuf_z = { d_Z, z_buf_offset, z_sz };
4449
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
4450
+
4451
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
4452
+ const uint32_t KW = src0->ne[0];
4453
+
4454
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
4455
+ const uint32_t OW = dst->ne[1];
4456
+
4457
+ const uint32_t batch = src1->ne[3];
4458
+
4459
+ elements = { OW * KW * KH, OH, batch * IC };
4460
+ } break;
4461
+ case GGML_OP_TIMESTEP_EMBEDDING:
4462
+ {
4463
+ const uint32_t dim = dst->op_params[0];
4464
+ uint32_t half_ceil = (dim + 1) / 2;
4465
+ elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
4466
+ } break;
4467
+ case GGML_OP_ADD:
4468
+ case GGML_OP_DIV:
4469
+ case GGML_OP_MUL:
4470
+ case GGML_OP_SCALE:
4471
+ case GGML_OP_SQR:
4472
+ case GGML_OP_SIN:
4473
+ case GGML_OP_COS:
4474
+ case GGML_OP_CLAMP:
4475
+ case GGML_OP_PAD:
4476
+ case GGML_OP_REPEAT:
4477
+ case GGML_OP_CPY:
4478
+ case GGML_OP_CONCAT:
4479
+ case GGML_OP_UPSCALE:
4480
+ case GGML_OP_UNARY:
4481
+ {
4482
+ const uint32_t ne = ggml_nelements(dst);
4483
+ if (ne > 262144) {
4484
+ elements = { 512, 512, CEIL_DIV(ne, 262144) };
4485
+ } else if (ne > 512) {
4486
+ elements = { 512, CEIL_DIV(ne, 512), 1 };
4229
4487
  } else {
4230
- subbuf_z = { d_X, 0, d_X->size };
4488
+ elements = { ne, 1, 1 };
4231
4489
  }
4490
+ } break;
4491
+ default:
4492
+ elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
4493
+ break;
4494
+ }
4232
4495
 
4233
- ggml_vk_sync_buffers(subctx);
4234
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4235
- } else if (use_src2) {
4236
- ggml_vk_sync_buffers(subctx);
4237
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4238
- } else if (use_src1) {
4239
- ggml_vk_sync_buffers(subctx);
4240
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4241
- } else {
4242
- ggml_vk_sync_buffers(subctx);
4243
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4496
+ if (!op_supports_incontiguous) {
4497
+ if (x_sz != VK_WHOLE_SIZE) {
4498
+ x_sz *= ne02 * ne03;
4244
4499
  }
4245
- } else {
4246
- GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4247
- GGML_ASSERT(op != GGML_OP_ARGSORT);
4248
- GGML_ASSERT(!use_src2);
4249
-
4250
- ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03);
4251
-
4252
- switch (dst->op) {
4253
- case GGML_OP_NORM:
4254
- case GGML_OP_RMS_NORM:
4255
- elements = { (uint32_t)ne01, 1, 1 };
4256
- break;
4257
- case GGML_OP_DIAG_MASK_INF:
4258
- case GGML_OP_ROPE:
4259
- elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
4260
- break;
4261
- case GGML_OP_GET_ROWS:
4262
- elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
4263
- break;
4264
- default:
4265
- elements = { (uint32_t)ne0, 1, 1 };
4266
- break;
4500
+ if (use_src1 && y_sz != VK_WHOLE_SIZE) {
4501
+ y_sz *= ne12 * ne13;
4502
+ }
4503
+ if (use_src2 && z_sz != VK_WHOLE_SIZE) {
4504
+ z_sz *= ne22 * ne23;
4505
+ }
4506
+ if (d_sz != VK_WHOLE_SIZE) {
4507
+ d_sz *= ned2 * ned3;
4267
4508
  }
4509
+ }
4268
4510
 
4269
- for (uint64_t i03 = 0; i03 < ne03; i03++) {
4270
- for (uint64_t i02 = 0; i02 < ne02; i02++) {
4271
- const uint32_t it_idx0 = (i03 * ne02 + i02);
4272
- const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0;
4273
- const uint32_t x_offset = x_sz * it_idx0;
4274
- const uint32_t y_offset = y_sz * it_idx1;
4275
- const uint32_t d_offset = d_sz * it_idx0;
4511
+ if (op == GGML_OP_SOFT_MAX) {
4512
+ // Empty src1 is possible in soft_max, but the shader needs a buffer
4513
+ vk_subbuffer subbuf_y;
4514
+ if (use_src1) {
4515
+ subbuf_y = { d_Y, y_buf_offset, y_sz };
4516
+ } else {
4517
+ subbuf_y = { d_X, 0, x_sz };
4518
+ }
4276
4519
 
4277
- if (use_src1) {
4278
- ggml_vk_sync_buffers(subctx);
4279
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4280
- } else {
4281
- ggml_vk_sync_buffers(subctx);
4282
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4283
- }
4284
- }
4520
+ ggml_vk_sync_buffers(subctx);
4521
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4522
+ } else if (op == GGML_OP_ROPE) {
4523
+ // Empty src2 is possible in rope, but the shader needs a buffer
4524
+ vk_subbuffer subbuf_z;
4525
+ if (use_src2) {
4526
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
4527
+ } else {
4528
+ subbuf_z = { d_X, 0, x_sz };
4285
4529
  }
4286
- }
4287
- }
4288
4530
 
4289
- static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4290
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
4531
+ ggml_vk_sync_buffers(subctx);
4532
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4533
+ } else if (op == GGML_OP_IM2COL) {
4534
+ // im2col uses only src1 and dst buffers
4535
+ ggml_vk_sync_buffers(subctx);
4536
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4537
+ } else if (use_src2) {
4538
+ ggml_vk_sync_buffers(subctx);
4539
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4540
+ } else if (use_src1) {
4541
+ ggml_vk_sync_buffers(subctx);
4542
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4543
+ } else {
4544
+ ggml_vk_sync_buffers(subctx);
4545
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4546
+ }
4291
4547
  }
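The rewritten `ggml_vk_op_f32` above now sizes elementwise and row-wise dispatches as a 3-D grid capped at 512 in the first two dimensions (512 · 512 = 262144 per slice), spilling any remainder into the third dimension with `CEIL_DIV`. A small stand-alone check of that sizing rule, assuming one element (or row) per grid slot:

```cpp
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Mirror of the dispatch-size selection: at most 512 in x and y, overflow goes to z.
static std::array<uint32_t, 3> grid_for(uint32_t n) {
    if (n > 262144) return { 512, 512, ceil_div(n, 262144) };
    if (n > 512)    return { 512, ceil_div(n, 512), 1 };
    return { n, 1, 1 };
}

int main() {
    for (uint32_t n : { 100u, 1000u, 300000u, 1u << 24 }) {
        const auto g = grid_for(n);
        // The grid must cover every element without exceeding the 512-per-axis cap.
        assert((uint64_t) g[0] * g[1] * g[2] >= n);
        assert(g[0] <= 512 && g[1] <= 512);
        std::printf("n=%u -> {%u, %u, %u}\n", n, g[0], g[1], g[2]);
    }
    return 0;
}
```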
4292
4548
 
4293
- static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4549
+ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4294
4550
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4295
4551
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4296
4552
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4301,11 +4557,32 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
4301
4557
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4302
4558
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4303
4559
  0,
4304
- 0.0f, 0.0f,
4305
- });
4560
+ 0.0f, 0.0f, 0,
4561
+ }, dryrun);
4306
4562
  }
4307
4563
 
4308
- static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4564
+ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4565
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4566
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
4567
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4568
+ const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4569
+
4570
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
4571
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
4572
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
4573
+ int offset = dst->op_params[3] / 4; // offset in bytes
4574
+
4575
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
4576
+ (uint32_t)ggml_nelements(src0),
4577
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
4578
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4579
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
4580
+ d_offset,
4581
+ 0.0f, 0.0f, offset,
4582
+ }, dryrun);
4583
+ }
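The new `ggml_vk_acc` wrapper above reads the view strides and the start offset for `GGML_OP_ACC` from `dst->op_params`, where ggml stores them in bytes, and divides by 4 so the shader can index in f32 elements (`nb3` is read but unused, as the comment notes). A tiny worked example of that conversion, with made-up byte values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // GGML_OP_ACC keeps nb1, nb2, nb3 and the start offset in op_params as bytes;
    // dividing by sizeof(float) turns them into element strides for the shader.
    const int32_t op_params[4] = { 4096, 262144, 16777216, 512 }; // example byte values
    const int nb1    = op_params[0] / 4;  // 1024 floats per row of the view
    const int nb2    = op_params[1] / 4;  // 65536 floats per plane
    const int offset = op_params[3] / 4;  // accumulation starts 128 floats in
    std::printf("nb1=%d nb2=%d offset=%d (elements)\n", nb1, nb2, offset);
    return 0;
}
```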
4584
+
4585
+ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4309
4586
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4310
4587
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4311
4588
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4316,11 +4593,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4316
4593
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4317
4594
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4318
4595
  0,
4319
- 0.0f, 0.0f,
4320
- });
4596
+ 0.0f, 0.0f, 0,
4597
+ }, dryrun);
4321
4598
  }
4322
4599
 
4323
- static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4600
+ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4324
4601
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4325
4602
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4326
4603
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4331,11 +4608,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4331
4608
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4332
4609
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4333
4610
  0,
4334
- 0.0f, 0.0f,
4335
- });
4611
+ 0.0f, 0.0f, 0,
4612
+ }, dryrun);
4336
4613
  }
4337
4614
 
4338
- static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4615
+ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4339
4616
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4340
4617
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4341
4618
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4346,11 +4623,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4346
4623
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4347
4624
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4348
4625
  0,
4349
- 0.0f, 0.0f,
4350
- });
4626
+ 0.0f, 0.0f, 0,
4627
+ }, dryrun);
4351
4628
  }
4352
4629
 
4353
- static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4630
+ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4631
+ int * op_params = (int *)dst->op_params;
4632
+
4633
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4634
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
4635
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4636
+
4637
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
4638
+ (uint32_t)ggml_nelements(dst),
4639
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4640
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
4641
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4642
+ 0,
4643
+ 0.0f, 0.0f, op_params[0],
4644
+ }, dryrun);
4645
+ }
4646
+
4647
+ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4648
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4649
+
4650
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
4651
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
4652
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
4653
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
4654
+
4655
+ ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
4656
+ (uint32_t)ggml_nelements(dst), 0,
4657
+ (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4658
+ (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
4659
+ sf0, sf1, sf2, sf3,
4660
+ }, dryrun);
4661
+ }
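`ggml_vk_upscale` above hands the shader one scale factor per dimension, `sf = dst->ne / src0->ne`. Assuming the usual nearest-neighbour mapping (destination index divided by the scale factor and truncated to pick the source index; the shader itself is not part of this hunk), a 1-D CPU sketch of how those factors get used:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Nearest-neighbour upscale along one dimension, to illustrate sf = dst_ne / src_ne.
static std::vector<float> upscale_1d(const std::vector<float> & src, uint32_t dst_ne) {
    const float sf = (float) dst_ne / (float) src.size();
    std::vector<float> dst(dst_ne);
    for (uint32_t i = 0; i < dst_ne; ++i) {
        dst[i] = src[(uint32_t)((float) i / sf)];  // floor(dst index / scale factor)
    }
    return dst;
}

int main() {
    const std::vector<float> src = { 1.0f, 2.0f, 3.0f };
    for (float v : upscale_1d(src, 6)) std::printf("%.0f ", v);  // 1 1 2 2 3 3
    std::printf("\n");
    return 0;
}
```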
4662
+
4663
+ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4354
4664
  float * op_params = (float *)dst->op_params;
4355
4665
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4356
4666
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4361,10 +4671,10 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
4361
4671
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4362
4672
  0,
4363
4673
  op_params[0], 0.0f
4364
- });
4674
+ }, dryrun);
4365
4675
  }
4366
4676
 
4367
- static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4677
+ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4368
4678
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4369
4679
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4370
4680
 
@@ -4374,10 +4684,36 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4374
4684
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4375
4685
  0,
4376
4686
  0.0f, 0.0f,
4377
- });
4687
+ }, dryrun);
4688
+ }
4689
+
4690
+ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4691
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4692
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4693
+
4694
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
4695
+ (uint32_t)ggml_nelements(src0),
4696
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4697
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4698
+ 0,
4699
+ 0.0f, 0.0f,
4700
+ }, dryrun);
4701
+ }
4702
+
4703
+ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4704
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4705
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4706
+
4707
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
4708
+ (uint32_t)ggml_nelements(src0),
4709
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4710
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4711
+ 0,
4712
+ 0.0f, 0.0f,
4713
+ }, dryrun);
4378
4714
  }
4379
4715
 
4380
- static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4716
+ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4381
4717
  float * op_params = (float *)dst->op_params;
4382
4718
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4383
4719
  const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4387,15 +4723,40 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
4387
4723
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4388
4724
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4389
4725
  0,
4390
- op_params[0], op_params[1],
4391
- });
4726
+ op_params[0], op_params[1],
4727
+ }, dryrun);
4728
+ }
4729
+
4730
+ static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4731
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4732
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4733
+
4734
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
4735
+ (uint32_t)ggml_nelements(dst),
4736
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4737
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4738
+ 0,
4739
+ 0.0f, 0.0f,
4740
+ }, dryrun);
4741
+ }
4742
+
4743
+ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4744
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
4745
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
4746
+
4747
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
4748
+ (uint32_t)ggml_nelements(dst),
4749
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4750
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4751
+ 0,
4752
+ 0.0f, 0.0f,
4753
+ }, dryrun);
4392
4754
  }
4393
4755
 
4394
- static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4395
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
4756
+ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4396
4757
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4397
4758
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4398
- const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4759
+ const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4399
4760
 
4400
4761
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
4401
4762
  (uint32_t)ggml_nelements(src0),
@@ -4403,30 +4764,41 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4403
4764
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
4404
4765
  d_offset,
4405
4766
  0.0f, 0.0f,
4406
- });
4767
+ }, dryrun);
4407
4768
  }
4408
4769
 
4409
- static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4770
+ static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4410
4771
  float * op_params = (float *)dst->op_params;
4411
4772
 
4412
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4773
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
4774
+ }
4775
+
4776
+ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4777
+ const int * int_op_params = (const int *)dst->op_params;
4778
+ const float * float_op_params = (const float *)dst->op_params;
4779
+
4780
+ const uint32_t num_groups = int_op_params[0];
4781
+ const float eps = float_op_params[1];
4782
+ const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
4783
+
4784
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
4413
4785
  }
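Editor's note: the new ggml_vk_group_norm reads dst->op_params twice, once as int32 (the group count in slot 0) and once as float (epsilon in slot 1), then derives the per-group element count by ceiling-dividing the channel dimension across groups. A minimal sketch of that arithmetic with illustrative names, assuming src0->ne is ordered {W, H, C, N} as in the usual ggml convention:

    #include <cstdint>

    // Sketch of the group-size computation used above (names are illustrative).
    static uint32_t group_norm_group_size(const int64_t ne[4], uint32_t num_groups) {
        const uint32_t W = (uint32_t) ne[0];
        const uint32_t H = (uint32_t) ne[1];
        const uint32_t C = (uint32_t) ne[2];
        const uint32_t channels_per_group = (C + num_groups - 1) / num_groups; // ceiling division
        return W * H * channels_per_group;   // elements covered by one normalization group
    }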
4414
4786
 
4415
- static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4787
+ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4416
4788
  float * op_params = (float *)dst->op_params;
4417
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4789
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
4418
4790
  }
4419
4791
 
4420
- static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4421
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
4792
+ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4793
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
4422
4794
  }
4423
4795
 
4424
- static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4796
+ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4425
4797
  int32_t * op_params = (int32_t *)dst->op_params;
4426
- ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
4798
+ ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
4427
4799
  }
4428
4800
 
4429
- static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4801
+ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4430
4802
  float * op_params = (float *)dst->op_params;
4431
4803
 
4432
4804
  float scale = op_params[0];
@@ -4448,10 +4820,10 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
4448
4820
  scale, max_bias,
4449
4821
  m0, m1,
4450
4822
  n_head_log2,
4451
- });
4823
+ }, dryrun);
4452
4824
  }
4453
4825
 
4454
- static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4826
+ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
4455
4827
  const int n_dims = ((int32_t *) dst->op_params)[1];
4456
4828
  // const int mode = ((int32_t *) dst->op_params)[2];
4457
4829
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4472,10 +4844,10 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
4472
4844
  (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4473
4845
  freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
4474
4846
  src2 != nullptr,
4475
- });
4847
+ }, dryrun);
4476
4848
  }
4477
4849
 
4478
- static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4850
+ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4479
4851
  int32_t * op_params = (int32_t *)dst->op_params;
4480
4852
 
4481
4853
  uint32_t ncols = src0->ne[0];
@@ -4491,11 +4863,60 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
4491
4863
  ncols,
4492
4864
  ncols_pad,
4493
4865
  op_params[0],
4494
- });
4866
+ }, dryrun);
4867
+ }
4868
+
4869
+ static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4870
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
4871
+ }
4872
+
4873
+ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
4874
+ const int32_t s0 = dst->op_params[0];
4875
+ const int32_t s1 = dst->op_params[1];
4876
+ const int32_t p0 = dst->op_params[2];
4877
+ const int32_t p1 = dst->op_params[3];
4878
+ const int32_t d0 = dst->op_params[4];
4879
+ const int32_t d1 = dst->op_params[5];
4880
+
4881
+ const bool is_2D = dst->op_params[6] == 1;
4882
+
4883
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
4884
+ const uint32_t IH = is_2D ? src1->ne[1] : 1;
4885
+ const uint32_t IW = src1->ne[0];
4886
+
4887
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
4888
+ const uint32_t KW = src0->ne[0];
4889
+
4890
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
4891
+ const uint32_t OW = dst->ne[1];
4892
+
4893
+ const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
4894
+ const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
4895
+
4896
+ const uint32_t pelements = OW * KW * KH;
4897
+
4898
+ ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
4899
+ batch_offset, offset_delta,
4900
+ IC, IW, IH, OW, OH, KW, KH,
4901
+ pelements,
4902
+ IC * KH * KW,
4903
+ s0, s1, p0, p1, d0, d1,
4904
+ }, dryrun);
4905
+ }
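Editor's note: the im2col push constants come from the convolution parameters packed into dst->op_params: strides (s0, s1), padding (p0, p1), dilation (d0, d1) and an is_2D flag. offset_delta and batch_offset are the f32 input's byte strides divided by 4 to get element strides, and pelements = OW * KW * KH is the per-output-row work size. OW and OH are read from dst->ne here, but they follow the standard convolution output formula; a hedged helper (not part of this file) that reproduces them from the same parameters:

    #include <cstdint>

    // Standard convolution output-size arithmetic, shown only to document how
    // OW and OH in the push constants relate to the op parameters above.
    static int64_t conv_out_size(int64_t in, int64_t kernel, int s, int p, int d) {
        return (in + 2 * p - d * (kernel - 1) - 1) / s + 1;
    }
    // e.g. OW = conv_out_size(IW, KW, s0, p0, d0);
    //      OH = is_2D ? conv_out_size(IH, KH, s1, p1, d1) : 1;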
4906
+
4907
+ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4908
+ const uint32_t dim = dst->op_params[0];
4909
+ const uint32_t max_period = dst->op_params[1];
4910
+ const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
4911
+
4912
+ ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
4913
+ nb1, dim, max_period,
4914
+ }, dryrun);
4495
4915
  }
4496
4916
 
4497
- static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4498
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f });
4917
+ static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
4918
+ const float * op_params = (const float *)dst->op_params;
4919
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
4499
4920
  }
4500
4921
 
4501
4922
  #ifdef GGML_VULKAN_RUN_TESTS
@@ -4641,9 +5062,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4641
5062
  }
4642
5063
  }
4643
5064
 
4644
- ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
5065
+ ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
4645
5066
  if (split_k > 1) {
4646
- ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5067
+ ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
4647
5068
 
4648
5069
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
4649
5070
  // Resize buffer
@@ -4654,6 +5075,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4654
5075
  }
4655
5076
  }
4656
5077
 
5078
+ ggml_pipeline_allocate_descriptor_sets(ctx->device);
5079
+
4657
5080
  vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4658
5081
  vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4659
5082
  vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -4686,7 +5109,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4686
5109
  ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
4687
5110
  ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
4688
5111
 
4689
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5112
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4690
5113
  for (size_t i = 0; i < num_it; i++) {
4691
5114
  ggml_vk_ctx_begin(ctx->device, subctx);
4692
5115
  ggml_vk_matmul(
@@ -4770,7 +5193,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4770
5193
 
4771
5194
  avg_err /= m * n;
4772
5195
 
4773
- std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl;
5196
+ double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
5197
+
5198
+ std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
4774
5199
 
4775
5200
  if (avg_err > 0.1) {
4776
5201
  std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -4890,14 +5315,16 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
4890
5315
  ggml_vk_quantize_data(x, qx, ne, quant);
4891
5316
  ggml_vk_dequantize_data(qx, x_ref, ne, quant);
4892
5317
 
4893
- ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
5318
+ ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
5319
+
5320
+ ggml_pipeline_allocate_descriptor_sets(ctx->device);
4894
5321
 
4895
5322
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
4896
5323
 
4897
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5324
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4898
5325
  ggml_vk_ctx_begin(ctx->device, subctx);
4899
5326
  const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
4900
- ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
5327
+ ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
4901
5328
  ggml_vk_ctx_end(subctx);
4902
5329
 
4903
5330
  auto begin = std::chrono::high_resolution_clock::now();
@@ -5011,9 +5438,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5011
5438
  y[i] = (i % k == i / k) ? 1.0f : 0.0f;
5012
5439
  }
5013
5440
 
5014
- ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
5441
+ ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
5015
5442
  if (split_k > 1) {
5016
- ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5443
+ ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5017
5444
 
5018
5445
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
5019
5446
  // Resize buffer
@@ -5024,10 +5451,12 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5024
5451
  }
5025
5452
  }
5026
5453
 
5454
+ ggml_pipeline_allocate_descriptor_sets(ctx->device);
5455
+
5027
5456
  ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
5028
5457
  ggml_vk_buffer_write(y_buf, 0, y, y_sz);
5029
5458
 
5030
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5459
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5031
5460
  for (size_t i = 0; i < num_it; i++) {
5032
5461
  ggml_vk_ctx_begin(ctx->device, subctx);
5033
5462
  ggml_vk_matmul(
@@ -5091,7 +5520,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5091
5520
 
5092
5521
  avg_err /= m * n;
5093
5522
 
5094
- std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
5523
+ double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
5524
+
5525
+ std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
5095
5526
 
5096
5527
  if (avg_err > 0.01 || std::isnan(avg_err)) {
5097
5528
  std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -5133,132 +5564,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5133
5564
  }
5134
5565
  #endif
5135
5566
 
5136
- static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
5137
- VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
5138
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
5139
- extra->reset();
5140
- tensor->extra = extra;
5141
- return extra;
5142
- }
5143
-
5144
- static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
5145
- VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
5146
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5147
-
5148
- if (extra == nullptr) {
5149
- return;
5150
- }
5151
-
5152
- ggml_tensor * src0 = node->src[0];
5153
- ggml_tensor * src1 = node->src[1];
5154
-
5155
- const bool use_src0 = src0 != nullptr;
5156
- const int64_t ne00 = use_src0 ? src0->ne[0] : 0;
5157
- const int64_t ne01 = use_src0 ? src0->ne[1] : 0;
5158
- const int64_t ne02 = use_src0 ? src0->ne[2] : 0;
5159
- const int64_t ne03 = use_src0 ? src0->ne[3] : 0;
5160
- const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP;
5161
- const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
5162
- const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
5163
- const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
5164
- const int64_t ne13 = use_src1 ? src1->ne[3] : 0;
5165
- const int64_t ne20 = node->ne[0];
5166
- const int64_t ne21 = node->ne[1];
5167
- const int64_t ne22 = node->ne[2];
5168
- const int64_t ne23 = node->ne[3];
5169
-
5170
- const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
5171
- const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
5172
-
5173
- const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
5174
- const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
5175
-
5176
- const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
5177
-
5178
- bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
5179
-
5180
- const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
5181
- const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
5182
-
5183
- int split_k;
5184
- if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
5185
- split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
5186
- } else {
5187
- split_k = 1;
5188
- }
5189
- const uint32_t x_ne = ne00 * ne01;
5190
- const uint32_t y_ne = ne10 * ne11;
5191
- const uint32_t d_ne = ne20 * ne21;
5192
-
5193
- const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
5194
- const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
5195
- uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
5196
- const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
5197
-
5198
- if (extra->buffer_gpu.expired()) {
5199
- // Workaround for CPU backend BLAS matmul calls
5200
- extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz);
5201
- }
5202
-
5203
- switch (node->op) {
5204
- case GGML_OP_REPEAT:
5205
- case GGML_OP_GET_ROWS:
5206
- case GGML_OP_RESHAPE:
5207
- case GGML_OP_VIEW:
5208
- case GGML_OP_PERMUTE:
5209
- case GGML_OP_TRANSPOSE:
5210
- case GGML_OP_ADD:
5211
- case GGML_OP_SCALE:
5212
- case GGML_OP_SQR:
5213
- case GGML_OP_CLAMP:
5214
- case GGML_OP_CPY:
5215
- case GGML_OP_CONT:
5216
- case GGML_OP_DUP:
5217
- case GGML_OP_MUL:
5218
- case GGML_OP_DIV:
5219
- case GGML_OP_NORM:
5220
- case GGML_OP_RMS_NORM:
5221
- case GGML_OP_DIAG_MASK_INF:
5222
- case GGML_OP_SOFT_MAX:
5223
- case GGML_OP_ROPE:
5224
- case GGML_OP_ARGSORT:
5225
- case GGML_OP_SUM_ROWS:
5226
- break;
5227
- case GGML_OP_UNARY:
5228
- switch (ggml_get_unary_op(node)) {
5229
- case GGML_UNARY_OP_SILU:
5230
- case GGML_UNARY_OP_GELU:
5231
- case GGML_UNARY_OP_RELU:
5232
- break;
5233
- default:
5234
- return;
5235
- }
5236
- break;
5237
- case GGML_OP_MUL_MAT:
5238
- case GGML_OP_MUL_MAT_ID:
5239
- if (ctx->prealloc_size_x < x_sz) {
5240
- ctx->prealloc_size_x = x_sz;
5241
- }
5242
- if (ctx->prealloc_size_y < y_sz) {
5243
- ctx->prealloc_size_y = y_sz;
5244
- }
5245
- if (ctx->prealloc_size_split_k < split_k_size) {
5246
- ctx->prealloc_size_split_k = split_k_size;
5247
- }
5248
- if (ctx->staging_size < x_sz + y_sz) {
5249
- ctx->staging_size = x_sz + y_sz;
5250
- }
5251
- break;
5252
- default:
5253
- return;
5254
- }
5255
- }
5256
-
5257
5567
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5258
5568
  #if defined(GGML_VULKAN_RUN_TESTS)
5259
- ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
5260
- vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
5261
- vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
5262
5569
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
5263
5570
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
5264
5571
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@@ -5418,28 +5725,19 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5418
5725
  }
5419
5726
  ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k);
5420
5727
  }
5421
- if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
5422
- VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
5423
- // Resize buffer
5424
- if (ctx->staging != nullptr) {
5425
- ggml_vk_destroy_buffer(ctx->staging);
5426
- }
5427
- ctx->staging = ggml_vk_create_buffer_check(ctx->device, ctx->staging_size,
5428
- vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
5429
- vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
5430
- }
5431
5728
  }
5432
5729
 
5433
- static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
5434
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5730
+ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
5435
5731
 
5436
- if (ggml_is_empty(node) || extra == nullptr) {
5437
- return;
5732
+ // Returns true if node has enqueued work into the queue, false otherwise
5733
+ // If submit is true, all operations queued so far are submitted to Vulkan to overlap command-list creation and GPU execution.
5734
+ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
5735
+ if (ggml_is_empty(node) || !node->buffer) {
5736
+ return false;
5438
5737
  }
5439
5738
 
5440
5739
  VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
5441
5740
  ctx->semaphore_idx = 0;
5442
- ctx->staging_offset = 0;
5443
5741
 
5444
5742
  const ggml_tensor * src0 = node->src[0];
5445
5743
  const ggml_tensor * src1 = node->src[1];
@@ -5452,29 +5750,38 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5452
5750
  case GGML_OP_PERMUTE:
5453
5751
  case GGML_OP_TRANSPOSE:
5454
5752
  case GGML_OP_NONE:
5455
- return;
5753
+ return false;
5456
5754
  case GGML_OP_UNARY:
5457
5755
  switch (ggml_get_unary_op(node)) {
5458
5756
  case GGML_UNARY_OP_SILU:
5459
5757
  case GGML_UNARY_OP_GELU:
5758
+ case GGML_UNARY_OP_GELU_QUICK:
5460
5759
  case GGML_UNARY_OP_RELU:
5760
+ case GGML_UNARY_OP_TANH:
5461
5761
  break;
5462
5762
  default:
5463
- return;
5763
+ return false;
5464
5764
  }
5465
5765
  break;
5466
5766
  case GGML_OP_REPEAT:
5467
5767
  case GGML_OP_GET_ROWS:
5468
5768
  case GGML_OP_ADD:
5769
+ case GGML_OP_ACC:
5469
5770
  case GGML_OP_MUL:
5470
5771
  case GGML_OP_DIV:
5772
+ case GGML_OP_CONCAT:
5773
+ case GGML_OP_UPSCALE:
5471
5774
  case GGML_OP_SCALE:
5472
5775
  case GGML_OP_SQR:
5776
+ case GGML_OP_SIN:
5777
+ case GGML_OP_COS:
5473
5778
  case GGML_OP_CLAMP:
5779
+ case GGML_OP_PAD:
5474
5780
  case GGML_OP_CPY:
5475
5781
  case GGML_OP_CONT:
5476
5782
  case GGML_OP_DUP:
5477
5783
  case GGML_OP_NORM:
5784
+ case GGML_OP_GROUP_NORM:
5478
5785
  case GGML_OP_RMS_NORM:
5479
5786
  case GGML_OP_DIAG_MASK_INF:
5480
5787
  case GGML_OP_SOFT_MAX:
@@ -5483,138 +5790,221 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5483
5790
  case GGML_OP_MUL_MAT_ID:
5484
5791
  case GGML_OP_ARGSORT:
5485
5792
  case GGML_OP_SUM_ROWS:
5793
+ case GGML_OP_IM2COL:
5794
+ case GGML_OP_TIMESTEP_EMBEDDING:
5795
+ case GGML_OP_LEAKY_RELU:
5486
5796
  break;
5487
5797
  default:
5488
5798
  std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
5489
5799
  GGML_ABORT("fatal error");
5490
- return;
5800
+ return false;
5491
5801
  }
5492
5802
 
5493
- if (ctx->compute_ctx == nullptr) {
5494
- ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5495
- ggml_vk_ctx_begin(ctx->device, ctx->compute_ctx);
5803
+ vk_context compute_ctx;
5804
+
5805
+ if (!dryrun) {
5806
+ if (ctx->compute_ctx.expired()) {
5807
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5808
+ ctx->compute_ctx = compute_ctx;
5809
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
5810
+ } else {
5811
+ compute_ctx = ctx->compute_ctx.lock();
5812
+ }
5496
5813
  }
5497
5814
 
5498
5815
  switch (node->op) {
5499
5816
  case GGML_OP_REPEAT:
5500
- ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node);
5817
+ ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
5818
+
5819
+ break;
5820
+ case GGML_OP_ACC:
5821
+ ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
5501
5822
 
5502
5823
  break;
5503
5824
  case GGML_OP_GET_ROWS:
5504
- ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node);
5825
+ ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
5505
5826
 
5506
5827
  break;
5507
5828
  case GGML_OP_ADD:
5508
- ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node);
5829
+ ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
5509
5830
 
5510
5831
  break;
5511
5832
  case GGML_OP_MUL:
5512
- ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);
5833
+ ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun);
5513
5834
 
5514
5835
  break;
5515
5836
  case GGML_OP_DIV:
5516
- ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node);
5837
+ ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun);
5838
+
5839
+ break;
5840
+ case GGML_OP_CONCAT:
5841
+ ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun);
5842
+
5843
+ break;
5844
+ case GGML_OP_UPSCALE:
5845
+ ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun);
5517
5846
 
5518
5847
  break;
5519
5848
  case GGML_OP_SCALE:
5520
- ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);
5849
+ ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun);
5521
5850
 
5522
5851
  break;
5523
5852
  case GGML_OP_SQR:
5524
- ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node);
5853
+ ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
5854
+
5855
+ break;
5856
+ case GGML_OP_SIN:
5857
+ ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
5858
+
5859
+ break;
5860
+ case GGML_OP_COS:
5861
+ ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
5525
5862
 
5526
5863
  break;
5527
5864
  case GGML_OP_CLAMP:
5528
- ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node);
5865
+ ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
5866
+
5867
+ break;
5868
+ case GGML_OP_PAD:
5869
+ ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
5529
5870
 
5530
5871
  break;
5531
5872
  case GGML_OP_CPY:
5532
5873
  case GGML_OP_CONT:
5533
5874
  case GGML_OP_DUP:
5534
- ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
5875
+ ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
5535
5876
 
5536
5877
  break;
5537
5878
  case GGML_OP_NORM:
5538
- ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
5879
+ ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);
5880
+
5881
+ break;
5882
+ case GGML_OP_GROUP_NORM:
5883
+ ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun);
5539
5884
 
5540
5885
  break;
5541
5886
  case GGML_OP_RMS_NORM:
5542
- ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node);
5887
+ ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun);
5543
5888
 
5544
5889
  break;
5545
5890
  case GGML_OP_UNARY:
5546
5891
  switch (ggml_get_unary_op(node)) {
5547
5892
  case GGML_UNARY_OP_SILU:
5548
5893
  case GGML_UNARY_OP_GELU:
5894
+ case GGML_UNARY_OP_GELU_QUICK:
5549
5895
  case GGML_UNARY_OP_RELU:
5550
- ggml_vk_unary(ctx, ctx->compute_ctx, src0, node);
5896
+ case GGML_UNARY_OP_TANH:
5897
+ ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
5551
5898
  break;
5552
5899
  default:
5553
- return;
5900
+ return false;
5554
5901
  }
5555
5902
  break;
5556
5903
  case GGML_OP_DIAG_MASK_INF:
5557
- ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node);
5904
+ ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);
5558
5905
 
5559
5906
  break;
5560
5907
  case GGML_OP_SOFT_MAX:
5561
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
5908
+ ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node, dryrun);
5562
5909
 
5563
5910
  break;
5564
5911
  case GGML_OP_ROPE:
5565
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
5912
+ ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, dryrun);
5566
5913
 
5567
5914
  break;
5568
5915
  case GGML_OP_ARGSORT:
5569
- ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);
5916
+ ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
5570
5917
 
5571
5918
  break;
5572
5919
  case GGML_OP_SUM_ROWS:
5573
- ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node);
5920
+ ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun);
5921
+
5922
+ break;
5923
+ case GGML_OP_IM2COL:
5924
+ ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
5925
+
5926
+ break;
5927
+ case GGML_OP_TIMESTEP_EMBEDDING:
5928
+ ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
5929
+
5930
+ break;
5931
+ case GGML_OP_LEAKY_RELU:
5932
+ ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
5574
5933
 
5575
5934
  break;
5576
5935
  case GGML_OP_MUL_MAT:
5577
- ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);
5936
+ ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node, dryrun);
5578
5937
 
5579
5938
  break;
5580
5939
  case GGML_OP_MUL_MAT_ID:
5581
- ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node);
5940
+ ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun);
5582
5941
 
5583
5942
  break;
5584
5943
  default:
5585
- return;
5944
+ return false;
5586
5945
  }
5587
5946
 
5588
- extra->ctx_idx = ctx->compute_ctx->idx;
5947
+ if (dryrun) {
5948
+ return false;
5949
+ }
5589
5950
 
5590
- #ifdef GGML_VULKAN_CHECK_RESULTS
5951
+ ctx->tensor_ctxs[node_idx] = compute_ctx;
5952
+
5953
+ #if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
5591
5954
  // Force context reset on each node so that each tensor ends up in its own context
5592
5955
  // and can be run and compared to its CPU equivalent separately
5593
5956
  last_node = true;
5594
5957
  #endif
5595
5958
 
5596
- if (last_node) {
5597
- ggml_vk_ctx_end(ctx->compute_ctx);
5598
- ctx->compute_ctx->exit_tensor = node;
5599
- ctx->compute_ctx = nullptr;
5959
+ if (submit || last_node) {
5960
+ ggml_vk_ctx_end(compute_ctx);
5961
+
5962
+ // TODO: it would probably be better to pass an exit_node flag to ggml_vk_compute_forward
5963
+ if (last_node) {
5964
+ compute_ctx->exit_tensor_idx = node_idx_begin;
5965
+ }
5966
+ else {
5967
+ compute_ctx->exit_tensor_idx = -1;
5968
+ }
5969
+
5970
+ ctx->compute_ctx.reset();
5971
+
5972
+ bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
5973
+ if (!ok) {
5974
+ if (node->op == GGML_OP_UNARY) {
5975
+ std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
5976
+ }
5977
+ else {
5978
+ std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
5979
+ }
5980
+ }
5981
+
5600
5982
  }
5983
+ return true;
5601
5984
  }
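Editor's note: ggml_vk_build_graph now runs in two modes. With dryrun set, each op handler only registers its pipeline's descriptor-set requirements and preallocation sizes, and the function returns before recording anything; without it, ops are recorded into a shared compute context, and when submit or last_node is set the context is closed, tagged with exit_tensor_idx, and handed to ggml_vk_compute_forward so recording of later nodes can overlap GPU execution of earlier ones. The caller-side loop is not part of this excerpt; the sketch below is an assumed outline of how such a two-pass driver could look, not the actual ggml_backend_vk_graph_compute implementation.

    // Assumed driver outline (illustrative, not code shipped in this file).
    static void run_graph_sketch(ggml_backend_vk_context * ctx, ggml_cgraph * graph) {
        // Pass 1: dryrun to collect descriptor-set requirements and buffer sizes.
        for (int i = 0; i < graph->n_nodes; i++) {
            ggml_vk_build_graph(ctx, graph->nodes[i], i, graph->nodes[i], i, true, false, false);
        }
        ggml_vk_preallocate_buffers(ctx);
        ggml_pipeline_allocate_descriptor_sets(ctx->device);

        // Pass 2: record and periodically submit so recording overlaps execution.
        const int submit_every = 100;   // hypothetical batching interval
        int first = 0;
        for (int i = 0; i < graph->n_nodes; i++) {
            const bool last   = i + 1 == graph->n_nodes;
            const bool submit = last || (i - first + 1) % submit_every == 0;
            const bool enqueued = ggml_vk_build_graph(ctx, graph->nodes[i], i,
                                                      graph->nodes[first], first,
                                                      false, last, submit);
            if (enqueued && submit) {
                first = i + 1;          // next batch starts after this submit
            }
        }
    }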
5602
5985
 
5603
- static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
5604
- ggml_tensor_extra_gpu * extra = nullptr;
5986
+ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
5987
+ ggml_backend_buffer * buf = nullptr;
5605
5988
 
5606
5989
  switch (tensor->op) {
5607
5990
  case GGML_OP_ADD:
5991
+ case GGML_OP_ACC:
5608
5992
  case GGML_OP_GET_ROWS:
5609
5993
  case GGML_OP_MUL:
5610
5994
  case GGML_OP_DIV:
5995
+ case GGML_OP_CONCAT:
5996
+ case GGML_OP_UPSCALE:
5611
5997
  case GGML_OP_SCALE:
5612
5998
  case GGML_OP_SQR:
5999
+ case GGML_OP_SIN:
6000
+ case GGML_OP_COS:
5613
6001
  case GGML_OP_CLAMP:
6002
+ case GGML_OP_PAD:
5614
6003
  case GGML_OP_CPY:
5615
6004
  case GGML_OP_CONT:
5616
6005
  case GGML_OP_DUP:
5617
6006
  case GGML_OP_NORM:
6007
+ case GGML_OP_GROUP_NORM:
5618
6008
  case GGML_OP_RMS_NORM:
5619
6009
  case GGML_OP_DIAG_MASK_INF:
5620
6010
  case GGML_OP_SOFT_MAX:
@@ -5626,15 +6016,21 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
5626
6016
  case GGML_OP_NONE:
5627
6017
  case GGML_OP_ARGSORT:
5628
6018
  case GGML_OP_SUM_ROWS:
5629
- extra = (ggml_tensor_extra_gpu *) tensor->extra;
6019
+ case GGML_OP_IM2COL:
6020
+ case GGML_OP_TIMESTEP_EMBEDDING:
6021
+ case GGML_OP_LEAKY_RELU:
6022
+ case GGML_OP_REPEAT:
6023
+ buf = tensor->buffer;
5630
6024
 
5631
6025
  break;
5632
6026
  case GGML_OP_UNARY:
5633
6027
  switch (ggml_get_unary_op(tensor)) {
5634
6028
  case GGML_UNARY_OP_SILU:
5635
6029
  case GGML_UNARY_OP_GELU:
6030
+ case GGML_UNARY_OP_GELU_QUICK:
5636
6031
  case GGML_UNARY_OP_RELU:
5637
- extra = (ggml_tensor_extra_gpu *) tensor->extra;
6032
+ case GGML_UNARY_OP_TANH:
6033
+ buf = tensor->buffer;
5638
6034
  break;
5639
6035
  default:
5640
6036
  return false;
@@ -5642,45 +6038,57 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
5642
6038
  break;
5643
6039
  case GGML_OP_MUL_MAT:
5644
6040
  case GGML_OP_MUL_MAT_ID:
5645
- extra = (ggml_tensor_extra_gpu *) tensor->extra;
6041
+ buf = tensor->buffer;
5646
6042
 
5647
6043
  break;
5648
6044
  default:
5649
6045
  return false;
5650
6046
  }
5651
6047
 
5652
- if (extra == nullptr) {
6048
+ if (buf == nullptr) {
5653
6049
  return false;
5654
6050
  }
5655
6051
 
5656
6052
  VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
5657
6053
 
5658
- #ifdef GGML_VULKAN_CHECK_RESULTS
5659
- ggml_vk_check_results_0(ctx, tensor);
5660
- #endif
6054
+ vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
5661
6055
 
5662
- vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
6056
+ // always wait for the GPU work to be done for the last submit
6057
+ if (tensor_idx == subctx->exit_tensor_idx) {
6058
+ use_fence = true;
6059
+ }
5663
6060
 
5664
6061
  // Only run if ctx hasn't been submitted yet
5665
- if (!subctx.seqs.empty()) {
6062
+ if (!subctx->seqs.empty()) {
6063
+ #ifdef GGML_VULKAN_CHECK_RESULTS
6064
+ ggml_vk_check_results_0(tensor);
6065
+ use_fence = true;
6066
+ #endif
6067
+
5666
6068
  // Do staging buffer copies
5667
- for (auto& cpy : subctx.in_memcpys) {
6069
+ for (auto& cpy : subctx->in_memcpys) {
5668
6070
  memcpy(cpy.dst, cpy.src, cpy.n);
5669
6071
  }
5670
6072
 
5671
- ggml_vk_submit(&subctx, ctx->fence);
5672
- }
6073
+ ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
6074
+
6075
+ if (use_fence) {
6076
+ VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
5673
6077
 
5674
- if (tensor == subctx.exit_tensor) {
5675
- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
5676
- ctx->device->device.resetFences({ ctx->fence });
6078
+ ctx->device->device.resetFences({ ctx->fence });
6079
+ }
6080
+ #ifdef GGML_VULKAN_CHECK_RESULTS
6081
+ ggml_vk_check_results_1(tensor);
6082
+ #endif
6083
+ }
5677
6084
 
6085
+ if (tensor_idx == subctx->exit_tensor_idx) {
5678
6086
  // Do staging buffer copies
5679
- for (auto& cpy : subctx.out_memcpys) {
6087
+ for (auto& cpy : subctx->out_memcpys) {
5680
6088
  memcpy(cpy.dst, cpy.src, cpy.n);
5681
6089
  }
5682
- subctx.in_memcpys.clear();
5683
- subctx.out_memcpys.clear();
6090
+ subctx->in_memcpys.clear();
6091
+ subctx->out_memcpys.clear();
5684
6092
  }
5685
6093
 
5686
6094
  return true;
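Editor's note: ggml_vk_compute_forward now takes the node index plus a use_fence flag instead of walking tensor->extra. Intermediate submits can go to the queue without a fence, so the CPU keeps recording the next batch while the GPU works; the submit whose tensor_idx matches exit_tensor_idx always uses the fence, waits on it, and resets it, and the staging read-backs happen only then. With GGML_VULKAN_CHECK_RESULTS the fence is forced for every node so each result can be compared against the CPU reference. The snippet below is a generic vulkan-hpp illustration of the optional-fence submit pattern, not code from this file:

    #include <vulkan/vulkan.hpp>

    // Generic pattern: submit with an optional fence, wait only when one was used.
    static void submit_maybe_fenced(vk::Device device, vk::Queue queue,
                                    vk::CommandBuffer cmd, vk::Fence fence, bool use_fence) {
        vk::SubmitInfo info{};
        info.commandBufferCount = 1;
        info.pCommandBuffers    = &cmd;
        queue.submit({ info }, use_fence ? fence : vk::Fence{});   // null fence = no CPU sync point
        if (use_fence) {
            (void) device.waitForFences({ fence }, VK_TRUE, UINT64_MAX);
            device.resetFences({ fence });
        }
    }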
@@ -5694,12 +6102,14 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5694
6102
  }
5695
6103
  ctx->gc.temp_buffers.clear();
5696
6104
 
5697
- for (auto& pipeline : ctx->device->pipelines) {
5698
- if (pipeline.expired()) {
6105
+ for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
6106
+ vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
6107
+
6108
+ if (plr.expired()) {
5699
6109
  continue;
5700
6110
  }
5701
6111
 
5702
- vk_pipeline pl = pipeline.lock();
6112
+ vk_pipeline pl = plr.lock();
5703
6113
  ggml_pipeline_cleanup(pl);
5704
6114
  }
5705
6115
 
@@ -5723,11 +6133,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5723
6133
  ctx->device->device.resetEvent(event);
5724
6134
  }
5725
6135
 
5726
- ctx->staging_offset = 0;
5727
-
5728
- ctx->compute_ctx = nullptr;
5729
- ctx->transfer_ctx = nullptr;
6136
+ ctx->tensor_ctxs.clear();
5730
6137
  ctx->gc.contexts.clear();
6138
+ ctx->device->pipeline_descriptor_set_requirements.clear();
5731
6139
  }
5732
6140
 
5733
6141
  // Clean up on backend free
@@ -5738,7 +6146,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5738
6146
  ggml_vk_destroy_buffer(ctx->prealloc_x);
5739
6147
  ggml_vk_destroy_buffer(ctx->prealloc_y);
5740
6148
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
5741
- ggml_vk_destroy_buffer(ctx->staging);
5742
6149
 
5743
6150
  for (auto& buffer : ctx->buffer_pool) {
5744
6151
  ggml_vk_destroy_buffer(buffer);
@@ -5747,7 +6154,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5747
6154
  ctx->prealloc_size_x = 0;
5748
6155
  ctx->prealloc_size_y = 0;
5749
6156
  ctx->prealloc_size_split_k = 0;
5750
- ctx->staging_size = 0;
5751
6157
 
5752
6158
  for (auto& event : ctx->gc.events) {
5753
6159
  ctx->device->device.destroyEvent(event);
@@ -5757,13 +6163,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5757
6163
  ctx->device->device.destroyFence(ctx->fence);
5758
6164
  }
5759
6165
 
5760
- GGML_CALL static int ggml_vk_get_device_count() {
6166
+ static int ggml_vk_get_device_count() {
5761
6167
  ggml_vk_instance_init();
5762
6168
 
5763
6169
  return vk_instance.device_indices.size();
5764
6170
  }
5765
6171
 
5766
- GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
6172
+ static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
5767
6173
  ggml_vk_instance_init();
5768
6174
 
5769
6175
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -5780,111 +6186,61 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
5780
6186
 
5781
6187
  // device backend
5782
6188
 
5783
- static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
5784
-
5785
- struct ggml_backend_vk_buffer_context {
5786
- vk_device_ref device;
5787
- vk_buffer dev_buffer;
5788
- ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
5789
- size_t temp_tensor_extra_index = 0;
5790
- std::string name;
5791
-
5792
- ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
5793
- device(device),
5794
- dev_buffer(dev_buffer),
5795
- name(name) {
5796
- }
5797
-
5798
- ~ggml_backend_vk_buffer_context() {
5799
- ggml_vk_destroy_buffer(dev_buffer);
5800
- if (temp_tensor_extras != nullptr) {
5801
- delete[] temp_tensor_extras;
5802
- }
5803
- }
5804
-
5805
- ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
5806
- if (temp_tensor_extras == nullptr) {
5807
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
5808
- }
5809
-
5810
- size_t alloc_index = temp_tensor_extra_index;
5811
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
5812
- ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
5813
- extra->reset();
5814
-
5815
- return extra;
5816
- }
5817
- };
5818
-
5819
- GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
6189
+ static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
5820
6190
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5821
6191
  return ctx->name.c_str();
5822
6192
  }
5823
6193
 
5824
- GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6194
+ static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
5825
6195
  return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
5826
6196
  }
5827
6197
 
5828
- GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6198
+ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
5829
6199
  VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
5830
6200
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5831
6201
  ggml_vk_destroy_buffer(ctx->dev_buffer);
5832
6202
  delete ctx;
5833
6203
  }
5834
6204
 
5835
- GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
6205
+ static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
5836
6206
  return vk_ptr_base;
5837
6207
 
5838
6208
  UNUSED(buffer);
5839
6209
  }
5840
6210
 
5841
- GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
6211
+ static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
5842
6212
  VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
5843
- ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5844
-
5845
6213
  if (tensor->view_src != nullptr) {
5846
6214
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
5847
- GGML_ASSERT(tensor->view_src->extra != nullptr);
5848
- tensor->extra = tensor->view_src->extra;
5849
- } else {
5850
- ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
5851
- extra->buffer_gpu = ctx->dev_buffer;
5852
- extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
5853
- tensor->extra = extra;
5854
6215
  }
5855
6216
  }
5856
6217
 
5857
- GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6218
+ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
5858
6219
  VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
5859
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
5860
-
5861
- vk_buffer buf = extra->buffer_gpu.lock();
5862
-
5863
- ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size);
6220
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6221
+ vk_buffer buf = buf_ctx->dev_buffer;
5864
6222
 
5865
- GGML_UNUSED(buffer);
6223
+ ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
5866
6224
  }
5867
6225
 
5868
- GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6226
+ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
5869
6227
  VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
5870
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6228
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5871
6229
 
5872
- vk_buffer buf = extra->buffer_gpu.lock();
6230
+ vk_buffer buf = buf_ctx->dev_buffer;
5873
6231
 
5874
- ggml_vk_buffer_read(buf, extra->offset + tensor->view_offs + offset, data, size);
5875
-
5876
- GGML_UNUSED(buffer);
6232
+ ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
5877
6233
  }
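Editor's note: with the ggml_tensor_extra_gpu bookkeeping gone, the device buffer is taken straight from the ggml_backend_vk_buffer_context and a tensor's byte offset inside it comes from vk_tensor_offset(tensor). Its definition is not part of this excerpt, but the deleted init_tensor code above computed the offset as the tensor's data pointer minus vk_ptr_base (the fake base returned by ggml_backend_vk_buffer_get_base), so the helper presumably does something along these lines; the view_src handling is an assumption inferred from the "+ tensor->view_offs" added at each call site.

    // Sketch only: offset of a tensor inside its Vulkan buffer, recovered from the
    // data pointer that the allocator assigned relative to the fake base address.
    static uint64_t vk_tensor_offset_sketch(const ggml_tensor * tensor) {
        const void * base = tensor->view_src ? tensor->view_src->data : tensor->data;
        return (uint64_t) ((const uint8_t *) base - (const uint8_t *) vk_ptr_base);
    }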
5878
6234
 
5879
- GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
6235
+ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
5880
6236
  if (ggml_backend_buffer_is_vk(src->buffer)) {
5881
- ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
5882
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6237
+ ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
6238
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
5883
6239
 
5884
- vk_buffer src_buf = src_extra->buffer_gpu.lock();
5885
- vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
6240
+ vk_buffer src_buf = src_buf_ctx->dev_buffer;
6241
+ vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
5886
6242
 
5887
- ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
6243
+ ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
5888
6244
 
5889
6245
  return true;
5890
6246
  }
@@ -5893,7 +6249,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
5893
6249
  UNUSED(buffer);
5894
6250
  }
5895
6251
 
5896
- GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
6252
+ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
5897
6253
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5898
6254
 
5899
6255
  ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
@@ -5904,6 +6260,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
5904
6260
  /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
5905
6261
  /* .get_base = */ ggml_backend_vk_buffer_get_base,
5906
6262
  /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
6263
+ /* .memset_tensor = */ NULL,
5907
6264
  /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
5908
6265
  /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
5909
6266
  /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
@@ -5912,13 +6269,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
5912
6269
  };
5913
6270
 
5914
6271
  // vk buffer type
5915
- GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
6272
+ static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
5916
6273
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
5917
6274
 
5918
6275
  return ctx->name.c_str();
5919
6276
  }
5920
6277
 
5921
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6278
+ static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
5922
6279
  VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
5923
6280
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
5924
6281
 
@@ -5934,23 +6291,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
5934
6291
  return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
5935
6292
  }
5936
6293
 
5937
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6294
+ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
5938
6295
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
5939
6296
  return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
5940
6297
  }
5941
6298
 
5942
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
6299
+ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
5943
6300
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
5944
6301
  return ctx->device->max_memory_allocation_size;
5945
6302
  }
5946
6303
 
5947
- GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
6304
+ static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
5948
6305
  return ggml_nbytes(tensor);
5949
6306
 
5950
6307
  UNUSED(buft);
5951
6308
  }
5952
6309
 
5953
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6310
+ ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
5954
6311
  ggml_vk_instance_init();
5955
6312
 
5956
6313
  VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
@@ -5962,24 +6319,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num)
5962
6319
 
5963
6320
  // host buffer type
5964
6321
 
5965
- GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
6322
+ static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
5966
6323
  return GGML_VK_NAME "_Host";
5967
6324
 
5968
6325
  UNUSED(buft);
5969
6326
  }
5970
6327
 
5971
- GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
6328
+ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
5972
6329
  return GGML_VK_NAME "_Host";
5973
6330
 
5974
6331
  UNUSED(buffer);
5975
6332
  }
5976
6333
 
5977
- GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6334
+ static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
5978
6335
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
5979
6336
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
5980
6337
  }
5981
6338
 
5982
- GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6339
+ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
5983
6340
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
5984
6341
 
5985
6342
  size += 32; // Behave like the CPU buffer type
@@ -6003,7 +6360,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6003
6360
  UNUSED(buft);
6004
6361
  }
6005
6362
 
6006
- GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6363
+ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
6007
6364
  return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
6008
6365
 
6009
6366
  UNUSED(buft);
@@ -6011,7 +6368,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back
6011
6368
 
6012
6369
  // Should be changed to return device-specific host buffer type
6013
6370
  // but that probably requires changes in llama.cpp
6014
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6371
+ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6015
6372
  static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
6016
6373
  /* .iface = */ {
6017
6374
  /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
@@ -6021,6 +6378,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6021
6378
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
6022
6379
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6023
6380
  },
6381
+ /* .device = */ nullptr,
6024
6382
  /* .context = */ nullptr,
6025
6383
  };
6026
6384
 
@@ -6034,13 +6392,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6034
6392
 
6035
6393
  // backend
6036
6394
 
6037
- GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6395
+ static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6038
6396
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6039
6397
 
6040
6398
  return ctx->name.c_str();
6041
6399
  }
6042
6400
 
6043
- GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6401
+ static void ggml_backend_vk_free(ggml_backend_t backend) {
6044
6402
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6045
6403
  VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6046
6404
 
@@ -6050,107 +6408,125 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6050
6408
  delete backend;
6051
6409
  }
6052
6410
 
6053
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
6411
+ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
6054
6412
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6055
6413
 
6056
6414
  return &ctx->device->buffer_type;
6057
6415
  }
6058
6416
 
6059
- GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6417
+ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6060
6418
  VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
6061
6419
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6062
6420
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6063
6421
 
6064
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6422
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
6423
+
6424
+ vk_context transfer_ctx;
6065
6425
 
6066
- if (ctx->transfer_ctx == nullptr) {
6426
+ if (ctx->transfer_ctx.expired()) {
6067
6427
  // Initialize new transfer context
6068
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6069
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
6428
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6429
+ ctx->transfer_ctx = transfer_ctx;
6430
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
6431
+ } else {
6432
+ transfer_ctx = ctx->transfer_ctx.lock();
6070
6433
  }
6071
6434
 
6072
- vk_buffer buf = extra->buffer_gpu.lock();
6435
+ vk_buffer buf = buf_ctx->dev_buffer;
6073
6436
 
6074
- ggml_vk_buffer_write_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
6437
+ ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6075
6438
  }
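Editor's note: the async set/get/cpy paths now keep ctx->transfer_ctx (and the graph path keeps ctx->compute_ctx) as a weak reference: if it has expired, a fresh vk_context is created and begun; otherwise the live one is locked and reused, so several queued transfers share one command buffer until ggml_backend_vk_synchronize submits it. The same expired()/lock() idiom in isolation, using plain std::shared_ptr/std::weak_ptr and a placeholder context type:

    #include <memory>

    struct dummy_ctx { /* stands in for vk_context_struct */ };

    // Reuse a live context if one exists, otherwise create one and cache it weakly.
    static std::shared_ptr<dummy_ctx> get_or_create(std::weak_ptr<dummy_ctx> & cached) {
        if (cached.expired()) {
            auto fresh = std::make_shared<dummy_ctx>();
            cached = fresh;     // the backend holds only a weak reference
            return fresh;       // the caller owns it until submission drops it
        }
        return cached.lock();
    }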
6076
6439
 
6077
- GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6440
+ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6078
6441
  VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
6079
6442
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6080
6443
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6081
6444
 
6082
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6445
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
6446
+
6447
+ vk_context transfer_ctx;
6083
6448
 
6084
- if (ctx->transfer_ctx == nullptr) {
6449
+ if (ctx->transfer_ctx.expired()) {
6085
6450
  // Initialize new transfer context
6086
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6087
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
6451
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6452
+ ctx->transfer_ctx = transfer_ctx;
6453
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
6454
+ } else {
6455
+ transfer_ctx = ctx->transfer_ctx.lock();
6088
6456
  }
6089
6457
 
6090
- vk_buffer buf = extra->buffer_gpu.lock();
6458
+ vk_buffer buf = buf_ctx->dev_buffer;
6091
6459
 
6092
- ggml_vk_buffer_read_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
6460
+ ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
6093
6461
  }
6094
6462
 
6095
- GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6463
+ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6096
6464
  VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
6097
6465
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6098
6466
  if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
6099
- ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
6100
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6467
+ ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
6468
+ ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
6469
+
6470
+ vk_context transfer_ctx;
6101
6471
 
6102
- if (ctx->transfer_ctx == nullptr) {
6472
+ if (ctx->transfer_ctx.expired()) {
6103
6473
  // Initialize new transfer context
6104
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6105
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
6474
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
6475
+ ctx->transfer_ctx = transfer_ctx;
6476
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
6477
+ } else {
6478
+ transfer_ctx = ctx->transfer_ctx.lock();
6106
6479
  }
6107
6480
 
6108
- vk_buffer src_buf = src_extra->buffer_gpu.lock();
6109
- vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
6481
+ vk_buffer src_buf = src_buf_ctx->dev_buffer;
6482
+ vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
6110
6483
 
6111
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
6484
+ ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
6112
6485
  return true;
6113
6486
  }
6114
6487
 
6115
6488
  return false;
6116
6489
  }
6117
6490
 
6118
- GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6491
+ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6119
6492
  VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
6120
6493
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6121
- if(ctx->transfer_ctx == nullptr) {
6494
+ if(ctx->transfer_ctx.expired()) {
6122
6495
  return;
6123
6496
  }
6124
6497
 
6125
- ggml_vk_ctx_end(ctx->transfer_ctx);
6498
+ vk_context transfer_ctx = ctx->transfer_ctx.lock();
6499
+
6500
+ ggml_vk_ctx_end(transfer_ctx);
6126
6501
 
6127
- for (auto& cpy : ctx->transfer_ctx->in_memcpys) {
6502
+ for (auto& cpy : transfer_ctx->in_memcpys) {
6128
6503
  memcpy(cpy.dst, cpy.src, cpy.n);
6129
6504
  }
6130
6505
 
6131
- ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
6506
+ ggml_vk_submit(transfer_ctx, ctx->fence);
6132
6507
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
6133
6508
  ctx->device->device.resetFences({ ctx->fence });
6134
6509
 
6135
- for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
6510
+ for (auto& cpy : transfer_ctx->out_memcpys) {
6136
6511
  memcpy(cpy.dst, cpy.src, cpy.n);
6137
6512
  }
6138
6513
 
6139
- ctx->transfer_ctx = nullptr;
6514
+ ctx->transfer_ctx.reset();
6140
6515
  }
6141
6516
 
6142
6517
  static bool ggml_vk_is_empty(ggml_tensor * node) {
6143
6518
  return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6144
6519
  }
6145
6520
 
6146
- GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6521
+ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6147
6522
  VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
6148
6523
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6149
6524
 
6150
6525
  for (int i = 0; i < cgraph->n_nodes; i++) {
6151
- ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]);
6526
+ ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
6152
6527
  }
6153
6528
  ggml_vk_preallocate_buffers(ctx);
6529
+ ggml_pipeline_allocate_descriptor_sets(ctx->device);
6154
6530
 
6155
6531
  int last_node = cgraph->n_nodes - 1;
6156
6532
 
@@ -6159,29 +6535,45 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
  last_node -= 1;
  }

- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
- }
+ // Reserve tensor context space for all nodes
+ ctx->tensor_ctxs.resize(cgraph->n_nodes);

- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_tensor * node = cgraph->nodes[i];
+ bool first_node_in_batch = true; // true if next node will be first node in a batch
+ int submit_node_idx = 0; // index to first node in a batch

- if (ggml_vk_is_empty(node)) {
- continue;
+ // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+ constexpr int submit_count = 100;
+ int submitted_nodes = 0;
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ if (first_node_in_batch) {
+ submit_node_idx = i;
  }

- bool ok = ggml_vk_compute_forward(ctx, node);
- if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+ bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+ if (enqueued) {
+ ++submitted_nodes;
+
+ #ifndef GGML_VULKAN_CHECK_RESULTS
+ if (first_node_in_batch) {
+ first_node_in_batch = false;
+ }
+ #endif
  }
- #ifdef GGML_VULKAN_CHECK_RESULTS
- else {
- ggml_vk_check_results_1(ctx, node);
+
+ if (submit) {
+ first_node_in_batch = true;
+ submitted_nodes = 0;
  }
- #endif
- GGML_ASSERT(ok);
  }

+ #ifdef GGML_VULKAN_PERF
+ ctx->device->perf_logger->print_timings();
+ #endif
+
  ggml_vk_graph_cleanup(ctx);

  return GGML_STATUS_SUCCESS;
@@ -6189,15 +6581,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
  UNUSED(backend);
  }

- GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
  // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;

  switch (op->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(op)) {
  case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
  case GGML_UNARY_OP_SILU:
  case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
  return ggml_is_contiguous(op->src[0]);
  default:
  return false;
@@ -6254,6 +6648,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  return false;
  }
  } break;
+ case GGML_OP_CONT:
  case GGML_OP_CPY:
  case GGML_OP_DUP:
  {
@@ -6270,11 +6665,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  }
  return false;
  } break;
- // case GGML_OP_REPEAT:
- // {
- // ggml_type src0_type = op->src[0]->type;
- // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
- // } break;
+ case GGML_OP_REPEAT:
+ return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
  case GGML_OP_ROPE:
  return ggml_is_contiguous(op->src[0]);
  case GGML_OP_NONE:
@@ -6283,18 +6675,27 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
  case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
+ case GGML_OP_RMS_NORM:
  case GGML_OP_ADD:
+ case GGML_OP_ACC:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
- case GGML_OP_RMS_NORM:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
  case GGML_OP_SCALE:
  case GGML_OP_SQR:
+ case GGML_OP_SIN:
+ case GGML_OP_COS:
  case GGML_OP_CLAMP:
- case GGML_OP_CONT:
+ case GGML_OP_PAD:
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_ARGSORT:
  case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
  return true;
  default:
  return false;
@@ -6303,7 +6704,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  UNUSED(backend);
  }

- GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;

  return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
@@ -6312,7 +6713,7 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
  UNUSED(backend);
  }

- GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
  if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
  return false;
  }
@@ -6340,11 +6741,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
  /* .supports_op = */ ggml_backend_vk_supports_op,
  /* .supports_buft = */ ggml_backend_vk_supports_buft,
  /* .offload_op = */ ggml_backend_vk_offload_op,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
  /* .event_record = */ NULL,
  /* .event_wait = */ NULL,
- /* .event_synchronize = */ NULL,
  };

  static ggml_guid_t ggml_backend_vk_guid() {
@@ -6352,7 +6750,7 @@ static ggml_guid_t ggml_backend_vk_guid() {
  return &guid;
  }

- GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
  VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");

  ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
@@ -6361,25 +6759,26 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
  ggml_backend_t vk_backend = new ggml_backend {
  /* .guid = */ ggml_backend_vk_guid(),
  /* .interface = */ ggml_backend_vk_interface,
+ /* .device = */ nullptr,
  /* .context = */ ctx,
  };

  return vk_backend;
  }

- GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
+ bool ggml_backend_is_vk(ggml_backend_t backend) {
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
  }

- GGML_CALL int ggml_backend_vk_get_device_count() {
+ int ggml_backend_vk_get_device_count() {
  return ggml_vk_get_device_count();
  }

- GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
+ void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
  ggml_vk_get_device_description(device, description, description_size);
  }

- GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
+ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
  GGML_ASSERT(device < (int) vk_instance.device_indices.size());

  vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
@@ -6395,27 +6794,6 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size
  }
  }

- // backend registry
- GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
- ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
- return vk_backend;
-
- UNUSED(params);
- }
-
- extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
-
- GGML_CALL int ggml_backend_vk_reg_devices() {
- ggml_vk_instance_init();
-
- for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
- char name[128];
- snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
- ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
- }
- return vk_instance.device_indices.size();
- }
-
  // Extension availability
  static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
  #ifdef GGML_VULKAN_VALIDATE
@@ -6509,17 +6887,19 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
  }
  }

- static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
+ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
  void * tensor_data = tensor->data;

- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer);
+
+ if (is_gpu) {
  const size_t tensor_size = ggml_nbytes(tensor);
  tensor_data = malloc(tensor_size);

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;

- vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+ vk_buffer buffer_gpu = buf_ctx->dev_buffer;
+ ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size);
  }

  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6533,13 +6913,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
  std::cerr << std::endl << "Result:" << std::endl;
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
  std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl;
  std::vector<const ggml_tensor *> done;
  ggml_vk_print_graph_origin(tensor, done);

- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ if (is_gpu) {
  free(tensor_data);
  }
  }
@@ -6548,8 +6925,8 @@ void * comp_result;
  size_t comp_size;
  size_t comp_nb[GGML_MAX_DIMS];
  size_t check_counter = 0;
- static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
- if (tensor->op == GGML_OP_TRANSPOSE) {
+ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
+ if (tensor->op == GGML_OP_TRANSPOSE) {
  return;
  }

@@ -6565,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  ggml_tensor * src2 = tensor->src[2];

  struct ggml_init_params iparams = {
- /*.mem_size =*/ 1024*1024*1024,
+ /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ false,
  };
@@ -6596,9 +6973,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  memcpy(src0_clone->data, src0->data, src0_size);
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
  } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
- vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset + src0->view_offs;
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+ vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+ uint64_t offset = vk_tensor_offset(src0) + src0->view_offs;
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
  for (int i3 = 0; i3 < src0->ne[3]; i3++) {
  for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6624,7 +7001,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  }

  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src0, "src0");
+ ggml_vk_print_tensor(src0, "src0");
  }
  }
  if (src1 != nullptr) {
@@ -6638,9 +7015,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  memcpy(src1_clone->data, src1->data, src1_size);
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
  } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
- vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset + src1->view_offs;
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+ vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+ uint64_t offset = vk_tensor_offset(src1) + src1->view_offs;
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
  for (int i3 = 0; i3 < src1->ne[3]; i3++) {
  for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6666,23 +7043,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  }

  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src1, "src1");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
- if (src1->src[0] != nullptr) {
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
- }
- if (src1->src[1] != nullptr) {
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src1_clone, done);
+ ggml_vk_print_tensor(src1, "src1");
  }
  }
  if (src2 != nullptr) {
@@ -6696,9 +7057,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  memcpy(src2_clone->data, src2->data, src2_size);
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
  } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
- vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset + src2->view_offs;
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context;
+ vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+ uint64_t offset = vk_tensor_offset(src2) + src2->view_offs;
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -6724,23 +7085,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  }

  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src2, "src2");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
- if (src2->src[0] != nullptr) {
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
- }
- if (src2->src[1] != nullptr) {
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src2_clone, done);
+ ggml_vk_print_tensor(src2, "src2");
  }
  }

@@ -6752,16 +7097,32 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
  } else if (tensor->op == GGML_OP_DIV) {
  tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
+ } else if (tensor->op == GGML_OP_CONCAT) {
+ tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_UPSCALE) {
+ tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
  } else if (tensor->op == GGML_OP_SCALE) {
  tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
  } else if (tensor->op == GGML_OP_SQR) {
  tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_SIN) {
+ tensor_clone = ggml_sin(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_COS) {
+ tensor_clone = ggml_cos(ggml_ctx, src0_clone);
  } else if (tensor->op == GGML_OP_CLAMP) {
  tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+ } else if (tensor->op == GGML_OP_PAD) {
+ tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
+ } else if (tensor->op == GGML_OP_REPEAT) {
+ tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor);
  } else if (tensor->op == GGML_OP_ADD) {
  tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
+ } else if (tensor->op == GGML_OP_ACC) {
+ tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
  } else if (tensor->op == GGML_OP_NORM) {
  tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_GROUP_NORM) {
+ tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params, ((float *)tensor->op_params)[1]);
  } else if (tensor->op == GGML_OP_RMS_NORM) {
  tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
  } else if (tensor->op == GGML_OP_SOFT_MAX) {
@@ -6777,12 +7138,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  const int mode = ((int32_t *) tensor->op_params)[2];
  //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
  const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
- float freq_base = ((float *) tensor->op_params)[5];
- float freq_scale = ((float *) tensor->op_params)[6];
- float ext_factor = ((float *) tensor->op_params)[7];
- float attn_factor = ((float *) tensor->op_params)[8];
- float beta_fast = ((float *) tensor->op_params)[9];
- float beta_slow = ((float *) tensor->op_params)[10];
+ const float freq_base = ((float *) tensor->op_params)[5];
+ const float freq_scale = ((float *) tensor->op_params)[6];
+ const float ext_factor = ((float *) tensor->op_params)[7];
+ const float attn_factor = ((float *) tensor->op_params)[8];
+ const float beta_fast = ((float *) tensor->op_params)[9];
+ const float beta_slow = ((float *) tensor->op_params)[10];
  tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
  } else if (tensor->op == GGML_OP_UNARY) {
  switch (ggml_get_unary_op(tensor)) {
@@ -6792,9 +7153,15 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  case GGML_UNARY_OP_GELU:
  tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
  break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone);
+ break;
  case GGML_UNARY_OP_RELU:
  tensor_clone = ggml_relu(ggml_ctx, src0_clone);
  break;
+ case GGML_UNARY_OP_TANH:
+ tensor_clone = ggml_tanh(ggml_ctx, src0_clone);
+ break;
  default:
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
  GGML_ABORT("fatal error");
@@ -6823,6 +7190,23 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
  } else if (tensor->op == GGML_OP_SUM_ROWS) {
  tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_IM2COL) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+
+ const bool is_2D = tensor->op_params[6] == 1;
+ tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+ } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+ const int32_t dim = tensor->op_params[0];
+ const int32_t max_period = tensor->op_params[1];
+ tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+ } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+ const float * op_params = (const float *)tensor->op_params;
+ tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
  } else {
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
  GGML_ABORT("fatal error");
@@ -6834,7 +7218,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
+ ggml_vk_print_tensor(tensor_clone, "tensor_clone");
  }

  comp_size = ggml_nbytes(tensor_clone);
@@ -6851,9 +7235,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
  }

  ggml_free(ggml_ctx);
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")");
  }

- static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
+ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
  if (tensor->op == GGML_OP_TRANSPOSE) {
  return;
  }
@@ -6873,14 +7259,15 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
  size_t tensor_size = ggml_nbytes(tensor);
  tensor_data = malloc(tensor_size);

- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;

- vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
- tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
+ vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+ uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs;
+ if (offset + tensor_size >= buffer_gpu->size) {
+ tensor_size = buffer_gpu->size - offset;
  }

- ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+ ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size);
  }

  float first_error_result = -1.0f;
@@ -6977,11 +7364,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
  std::cerr << std::endl << "Correct:" << std::endl;
  ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
  std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl << "Correct:" << std::endl;
- ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
- std::cerr << std::endl;
  std::vector<const ggml_tensor *> done;
  ggml_vk_print_graph_origin(tensor, done);
  }
@@ -7018,5 +7400,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
  if (ggml_backend_buffer_is_vk(tensor->buffer)) {
  free(tensor_data);
  }
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
  }
  #endif
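For reference, the hunks above replace the Vulkan backend's raw transfer-context pointer with a weak reference: each async set/get/copy revives or creates the context on demand, and ggml_backend_vk_synchronize() submits the batched work and then drops the reference. The sketch below illustrates only that expired()/lock()/reset() ownership flow; all names here (demo_*) are illustrative stand-ins, not the actual vk_context or ggml_backend_vk_context definitions from ggml-vulkan.cpp, and the "pool" is an assumption standing in for whatever longer-lived structure owns the shared context until submission.

    // Minimal, hedged sketch of the weak-reference lifecycle used above.
    #include <cassert>
    #include <memory>
    #include <vector>

    struct demo_transfer_ctx { std::vector<int> queued; };      // stand-in for the context's recorded work
    using demo_ctx_ref  = std::shared_ptr<demo_transfer_ctx>;   // owning handle (plays the role of vk_context)
    using demo_ctx_weak = std::weak_ptr<demo_transfer_ctx>;     // what the backend context keeps

    struct demo_backend {
        demo_ctx_weak transfer_ctx;                              // like ctx->transfer_ctx in the diff
        std::vector<demo_ctx_ref> pool;                          // assumed owner of in-flight contexts
    };

    // Async transfers: create a context on first use, reuse it for subsequent transfers.
    demo_ctx_ref demo_get_transfer_ctx(demo_backend & b) {
        if (b.transfer_ctx.expired()) {
            demo_ctx_ref c = std::make_shared<demo_transfer_ctx>();
            b.pool.push_back(c);                                 // pool keeps the context alive
            b.transfer_ctx = c;
            return c;
        }
        return b.transfer_ctx.lock();
    }

    // Synchronize: flush whatever was batched, then release the weak reference.
    void demo_synchronize(demo_backend & b) {
        if (b.transfer_ctx.expired()) {
            return;                                              // nothing queued since the last sync
        }
        demo_ctx_ref c = b.transfer_ctx.lock();
        c->queued.clear();                                       // stands in for submit + fence wait
        b.transfer_ctx.reset();
    }

    int main() {
        demo_backend b;
        demo_get_transfer_ctx(b)->queued.push_back(1);           // first transfer creates the context
        demo_get_transfer_ctx(b)->queued.push_back(2);           // later transfers reuse it
        demo_synchronize(b);
        assert(b.transfer_ctx.expired());                        // released until the next transfer
        return 0;
    }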