llama_cpp 0.16.0 → 0.16.2

Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/ext/llama_cpp/extconf.rb +3 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +14 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +4 -0
  7. data/vendor/tmp/llama.cpp/Makefile +119 -54
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
  126. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  127. data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
  128. data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
  129. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
  130. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
  131. data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
  132. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
  133. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
  134. data/vendor/tmp/llama.cpp/ggml.c +158 -414
  135. data/vendor/tmp/llama.cpp/ggml.h +6 -0
  136. data/vendor/tmp/llama.cpp/llama.cpp +628 -279
  137. data/vendor/tmp/llama.cpp/llama.h +9 -1
  138. data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
  139. data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
  140. data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
  141. data/vendor/tmp/llama.cpp/unicode.h +1 -1
  142. metadata +15 -3
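
The remaining hunks on this page come from data/vendor/tmp/llama.cpp/ggml-vulkan.cpp (item 133 above). Most of them replace repeated #ifdef GGML_VULKAN_DEBUG / std::cerr / #endif blocks with a single VK_LOG_DEBUG macro. A minimal standalone sketch of that pattern, assuming GGML_VULKAN_DEBUG is a compile-time define (illustrative only, not the vendored source):

    #include <iostream>

    // Collapses to a no-op unless GGML_VULKAN_DEBUG is defined, mirroring the
    // macro introduced in the first hunks below.
    #ifdef GGML_VULKAN_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)
    #endif

    // Hypothetical call site: one macro invocation replaces the old
    // three-line #ifdef block at each logging point.
    static void example_call_site(int idx) {
        VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
    }

    int main() {
        example_call_site(0); // prints to stderr only when built with -DGGML_VULKAN_DEBUG
        return 0;
    }
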
@@ -1,5 +1,5 @@
  #include "ggml-vulkan.h"
-
+ #include <vulkan/vulkan_core.h>
  #ifdef GGML_VULKAN_RUN_TESTS
  #include <chrono>
  #endif
@@ -8,13 +8,15 @@

  #include <algorithm>
  #include <cmath>
+ #include <iomanip>
  #include <iostream>
- #include <limits>
  #include <tuple>
  #include <vector>
  #include <sstream>
  #include <utility>
  #include <memory>
+ #include <limits>
+ #include <map>

  #include "ggml.h"
  #include "ggml-backend-impl.h"
@@ -56,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  } \
  } while (0)

+ #ifdef GGML_VULKAN_DEBUG
+ #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+ #else
+ #define VK_LOG_DEBUG(msg) ((void) 0)
+ #endif // GGML_VULKAN_DEBUG
+
  struct ggml_backend_vk_context;

  struct vk_queue {
@@ -150,7 +158,7 @@ struct vk_device {
  vk_pipeline pipeline_relu_f32;
  vk_pipeline pipeline_diag_mask_inf_f32;
  vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
- vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+ vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
  vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
  vk_pipeline pipeline_argsort_f32;
  vk_pipeline pipeline_sum_rows_f32;
@@ -158,9 +166,7 @@ struct vk_device {
  std::vector<vk_pipeline_ref> pipelines;

  ~vk_device() {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "destroy device " << name << std::endl;
- #endif
+ VK_LOG_DEBUG("destroy device " << name);
  device.destroyCommandPool(compute_queue.pool);
  if (!single_queue) {
  device.destroyCommandPool(transfer_queue.pool);
@@ -195,9 +201,7 @@ struct vk_buffer_struct {
  if (size == 0) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

  device->device.freeMemory(device_memory);
  device->device.destroyBuffer(buffer);
@@ -283,26 +287,15 @@ struct vk_op_diag_mask_push_constants {

  struct vk_op_rope_push_constants {
  uint32_t ncols;
+ uint32_t n_dims;
  float freq_scale;
  uint32_t p_delta_rows;
  float freq_base;
  float ext_factor;
  float attn_factor;
- float corr_dims[4];
- };
-
- struct vk_op_rope_neox_push_constants {
- uint32_t ncols;
- uint32_t ndims;
- float freq_scale;
- uint32_t p_delta_rows;
- float freq_base;
- float ext_factor;
- float attn_factor;
- float corr_dims[4];
+ float corr_dims[2];
  float theta_scale;
- float inv_ndims;
- uint32_t has_freq_facs;
+ uint32_t has_ff;
  };

  struct vk_op_soft_max_push_constants {
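
After this hunk the separate NEOX variant of the rope push constants is gone; reconstructing the merged struct from the unchanged and added lines above (a sketch of the post-patch layout, not copied from the file itself):

    struct vk_op_rope_push_constants {
        uint32_t ncols;
        uint32_t n_dims;
        float    freq_scale;
        uint32_t p_delta_rows;
        float    freq_base;
        float    ext_factor;
        float    attn_factor;
        float    corr_dims[2];   // shrunk from corr_dims[4]
        float    theta_scale;
        uint32_t has_ff;         // renamed from has_freq_facs
    };
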
@@ -345,15 +338,12 @@ struct vk_context {
  };

  struct ggml_tensor_extra_gpu {
- bool ready;
-
  size_t ctx_idx;

  vk_buffer_ref buffer_gpu;
  uint64_t offset;

  void reset() {
- ready = false;
  ctx_idx = 0;
  buffer_gpu.reset();
  offset = 0;
@@ -368,6 +358,49 @@ struct ggml_vk_garbage_collector {
  std::vector<vk_context> contexts;
  };

+ #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+ #include <mutex>
+
+ #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+ static std::string format_size(size_t size) {
+ const size_t kib = 1024;
+ const size_t mib = kib * 1024;
+ const size_t gib = mib * 1024;
+
+ std::ostringstream oss;
+ oss << std::fixed << std::setprecision(2);
+
+ if (size >= gib) {
+ oss << static_cast<double>(size) / gib << " GiB";
+ } else if (size >= mib) {
+ oss << static_cast<double>(size) / mib << " MiB";
+ } else if (size >= kib) {
+ oss << static_cast<double>(size) / kib << " KiB";
+ } else {
+ oss << size << " B";
+ }
+
+ return oss.str();
+ }
+
+ static std::mutex log_mutex;
+
+ class vk_memory_logger {
+ public:
+ vk_memory_logger(): total_device(0), total_host(0) {}
+ void log_allocation(vk_buffer_ref buf_ref, size_t size);
+ void log_deallocation(vk_buffer_ref buf_ref);
+
+ private:
+ std::map<vk::Buffer, size_t> allocations; // Track allocations
+ size_t total_device;
+ size_t total_host;
+ };
+ #else
+ #define VK_LOG_MEMORY(msg) ((void) 0)
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct ggml_backend_vk_context {
  std::string name;

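The format_size helper added in this hunk rounds to two decimals at binary-unit (KiB/MiB/GiB) thresholds. A small standalone check (the function body is copied from the hunk above; the main() harness and expected outputs are my own illustration):

    #include <cstddef>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Pretty-print a byte count with binary units, as in the hunk above.
    static std::string format_size(size_t size) {
        const size_t kib = 1024;
        const size_t mib = kib * 1024;
        const size_t gib = mib * 1024;

        std::ostringstream oss;
        oss << std::fixed << std::setprecision(2);

        if (size >= gib) {
            oss << static_cast<double>(size) / gib << " GiB";
        } else if (size >= mib) {
            oss << static_cast<double>(size) / mib << " MiB";
        } else if (size >= kib) {
            oss << static_cast<double>(size) / kib << " KiB";
        } else {
            oss << size << " B";
        }

        return oss.str();
    }

    int main() {
        std::cout << format_size(512) << "\n";           // 512 B
        std::cout << format_size(1536) << "\n";          // 1.50 KiB
        std::cout << format_size(2147483648ULL) << "\n"; // 2.00 GiB
        return 0;
    }
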
@@ -392,8 +425,45 @@ struct ggml_backend_vk_context {
  bool initialized;

  size_t idx;
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ vk_memory_logger memory_logger;
+ #endif
  };

+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ const std::string type = device ? "device" : "host";
+ allocations[buf->buffer] = size;
+ total_device += device ? size : 0;
+ total_host += device ? 0 : size;
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ }
+
+ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+ if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+ return;
+ }
+
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ std::string type = device ? "device" : "host";
+ auto it = allocations.find(buf->buffer);
+ total_device -= device ? it->second : 0;
+ total_host -= device ? 0 : it->second;
+ if (it != allocations.end()) {
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ allocations.erase(it);
+ } else {
+ VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+ }
+ }
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct vk_instance_t {
  vk::Instance instance;

@@ -406,15 +476,11 @@ struct vk_instance_t {
  };

  static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
  static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

  if (devices[idx].expired()) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Initializing new vk_device" << std::endl;
- #endif
+ VK_LOG_DEBUG("Initializing new vk_device");
  std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
  device->initialized = false;
  devices[idx] = device;
@@ -441,9 +507,7 @@ static vk_instance_t vk_instance;
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

  static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
  GGML_ASSERT(parameter_count > 0);
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

@@ -544,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
  }

  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
  for (auto& pool : pipeline->descriptor_pools) {
  device.destroyDescriptorPool(pool);
  }
@@ -564,9 +626,7 @@
  }

  static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
  if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
  // Enough descriptors are available
  return;
@@ -596,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
  }

  static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
  pipeline->descriptor_set_idx = 0;
  }

  static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
  if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
  // Reuse command buffer
  return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -625,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
  }

  static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_submission()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_submission()");
  vk_submission s;
  s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
  s.wait_semaphores = std::move(wait_semaphores);
@@ -636,9 +690,7 @@
  }

  static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
  if (ctx->seqs.empty()) {
  return;
  }
@@ -712,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
  }

  static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
  const uint32_t qfsize = queue_family_props.size();

  // Try with avoid preferences first
@@ -760,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
  }

  static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_queue()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_queue()");
  q.queue_family_index = queue_family_index;

  vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -776,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
  }

  static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_context()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_context()");
  ctx->gc.contexts.emplace_back();
  vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
  memset((void *) result, 0, sizeof(vk_context));
@@ -788,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
  }

  static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
  vk::SemaphoreCreateInfo ci{};
  ci.setPNext(&tci);
@@ -800,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
  }

  static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
  vk::SemaphoreCreateInfo ci{};
@@ -821,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
  // Requires command buffers to be done

  ctx->device->device.resetCommandPool(q.pool);
@@ -843,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
  }

  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
  vk_buffer buf = std::make_shared<vk_buffer_struct>();

  if (size == 0) {
@@ -905,8 +943,8 @@

  buf->device = ctx->device;

- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Created buffer " << buf->buffer << std::endl;
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ ctx->memory_logger.log_allocation(buf, size);
  #endif

  return buf;
@@ -941,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
  }

  static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+ if (buf == nullptr) {
+ return;
+ }
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ buf->ctx->memory_logger.log_deallocation(buf);
+ #endif
+
  buf.reset();
  }

@@ -949,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
  }

  static void ggml_vk_sync_buffers(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_sync_buffers()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_sync_buffers()");
  const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

  ctx->s->buffer.pipelineBarrier(
@@ -965,9 +1009,7 @@
  }

  static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_wait_events()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_wait_events()");
  if (events.empty()) {
  return;
  }
@@ -1002,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
  }

  static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

  const std::shared_ptr<vk_device> device = ctx->device;

@@ -1055,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1055
1095
  ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
1056
1096
 
1057
1097
  if (device->fp16) {
1058
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1059
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1060
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1061
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1062
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1063
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1098
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1099
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1100
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1101
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1102
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1103
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1064
1104
 
1065
1105
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1066
1106
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1153,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1153
1193
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1154
1194
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1155
1195
 
1156
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1157
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1158
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1159
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1160
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1161
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1196
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1197
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1198
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1199
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1200
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1201
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1162
1202
 
1163
1203
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1164
1204
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1244,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1244
1284
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1245
1285
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1246
1286
  } else {
1247
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1248
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1249
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1250
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1251
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1252
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1287
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1288
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1289
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1290
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1291
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1292
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1253
1293
 
1254
1294
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1255
1295
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1342,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1342
1382
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1343
1383
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1344
1384
 
1345
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1346
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1347
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1348
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1349
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1350
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1385
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1386
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1387
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
1388
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
1389
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
1390
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
1351
1391
 
1352
1392
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1353
1393
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1442,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1442
1482
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1443
1483
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1444
1484
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1445
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1446
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1447
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1448
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1449
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1485
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1486
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1487
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1488
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1489
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1450
1490
 
1451
1491
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1452
1492
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1455
1495
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1456
1496
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1457
1497
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1458
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1459
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1460
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1461
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1462
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1498
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1499
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1500
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1501
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1502
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1463
1503
 
1464
1504
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1465
1505
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1468,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1468
1508
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1469
1509
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1470
1510
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1471
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1472
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
  // dequant shaders
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1481,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
  // get_rows
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1537,11 +1577,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1551,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  static void ggml_vk_print_gpu_info(size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
  GGML_ASSERT(vk_instance.initialized);
 
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1569,8 +1607,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
  vk::PhysicalDeviceProperties2 props2;
  vk::PhysicalDeviceMaintenance3Properties props3;
  vk::PhysicalDeviceSubgroupProperties subgroup_props;
+ vk::PhysicalDeviceDriverProperties driver_props;
  props2.pNext = &props3;
  props3.pNext = &subgroup_props;
+ subgroup_props.pNext = &driver_props;
  physical_device.getProperties2(&props2);
 
  const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1614,7 +1654,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
  fp16 = fp16 && vk12_features.shaderFloat16;
 
  std::string device_name = props2.properties.deviceName.data();
- std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+ std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
  if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
  std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1628,9 +1668,7 @@ void ggml_vk_instance_init() {
  if (vk_instance_initialized) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_instance_init()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_instance_init()");
 
  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1707,10 +1745,80 @@ void ggml_vk_instance_init() {
 
  // Default to using all dedicated GPUs
  for (size_t i = 0; i < devices.size(); i++) {
- vk::PhysicalDeviceProperties props = devices[i].getProperties();
+ vk::PhysicalDeviceProperties2 new_props;
+ vk::PhysicalDeviceDriverProperties new_driver;
+ vk::PhysicalDeviceIDProperties new_id;
+ new_props.pNext = &new_driver;
+ new_driver.pNext = &new_id;
+ devices[i].getProperties2(&new_props);
+
+ if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ // Check if there are two physical devices corresponding to the same GPU
+ auto old_device = std::find_if(
+ vk_instance.device_indices.begin(),
+ vk_instance.device_indices.end(),
+ [&devices, &new_id](const size_t k){
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceIDProperties old_id;
+ old_props.pNext = &old_id;
+ devices[k].getProperties2(&old_props);
+ return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+ }
+ );
+ if (old_device == vk_instance.device_indices.end()) {
+ vk_instance.device_indices.push_back(i);
+ } else {
+ // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+ // This can cause errors when splitting layers across the devices, need to keep only 1
+ VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
+
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceDriverProperties old_driver;
+ old_props.pNext = &old_driver;
+ devices[*old_device].getProperties2(&old_props);
+
+ std::map<vk::DriverId, int> driver_priorities {};
+ int old_priority = std::numeric_limits<int>::max();
+ int new_priority = std::numeric_limits<int>::max();
+
+ // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+ // Smaller number -> higher priority
+ switch (old_props.properties.vendorID) {
+ case VK_VENDOR_ID_AMD:
+ driver_priorities[vk::DriverId::eMesaRadv] = 1;
+ driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+ driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+ break;
+ case VK_VENDOR_ID_INTEL:
+ driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+ driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+ break;
+ case VK_VENDOR_ID_NVIDIA:
+ driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+ #if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+ driver_priorities[vk::DriverId::eMesaNvk] = 2;
+ #endif
+ break;
+ }
+
+ if (driver_priorities.count(old_driver.driverID)) {
+ old_priority = driver_priorities[old_driver.driverID];
+ }
+ if (driver_priorities.count(new_driver.driverID)) {
+ new_priority = driver_priorities[new_driver.driverID];
+ }
+
+ if (new_priority < old_priority) {
+ auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+ vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+ vk_instance.device_indices.push_back(i);
 
- if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
- vk_instance.device_indices.push_back(i);
+ VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
+ }
+ else {
+ VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
+ }
+ }
  }
  }
 
@@ -1732,9 +1840,7 @@ void ggml_vk_instance_init() {
  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
  ggml_vk_instance_init();
 
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1907,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  }
 
  static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
  switch (type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_Q4_0:
@@ -1931,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
  }
 
  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
@@ -1969,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  }
 
  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
  switch (a_type) {
@@ -1996,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
  }
 
  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_id_f32;
  }
@@ -2031,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
  }
 
  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32);
 
  switch (a_type) {
@@ -2058,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
2058
2154
  }
2059
2155
 
2060
2156
  static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
2061
- #ifdef GGML_VULKAN_DEBUG
2062
- std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
2063
- #endif
2157
+ VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
2158
+ VK_LOG_MEMORY("ggml_vk_pool_malloc");
2159
+
2064
2160
  int best_i = -1;
2065
2161
  size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
2066
2162
  int worst_i = -1;
@@ -2088,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
2088
2184
  ggml_vk_destroy_buffer(b);
2089
2185
  }
2090
2186
 
2091
- return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
2187
+ return ggml_vk_create_buffer_device(ctx, size);
2092
2188
  }
2093
2189
 
2094
2190
  static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
2095
- #ifdef GGML_VULKAN_DEBUG
2096
- std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
2097
- #endif
2191
+ VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
2098
2192
  for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
2099
2193
  vk_buffer& b = ctx->buffer_pool[i];
2100
2194
  if (b == nullptr) {
@@ -2115,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
2115
2209
  }
2116
2210
  }
2117
2211
 
2212
+ VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
2213
+
2118
2214
  // Otherwise create new buffer
2119
2215
  vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
2120
2216
  ctx->gc.temp_buffers.push_back(buf);
@@ -2123,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
2123
2219
  }
2124
2220
 
2125
2221
  static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
2126
- #ifdef GGML_VULKAN_DEBUG
2127
- std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
2128
- #endif
2222
+ VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
2129
2223
  vk_buffer buf = ggml_vk_create_buffer(ctx, size,
2130
2224
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
2131
2225
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2147,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
2147
2241
  if (ptr == nullptr) {
2148
2242
  return;
2149
2243
  }
2150
- #ifdef GGML_VULKAN_DEBUG
2151
- std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
2152
- #endif
2244
+ VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
2153
2245
  vk_buffer buf;
2154
2246
  size_t index;
2155
2247
  for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2201,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
2201
2293
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
2202
2294
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
2203
2295
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
2204
- #ifdef GGML_VULKAN_DEBUG
2205
- std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2296
+ VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2206
2297
  for (auto& buffer : buffers) {
2207
2298
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
2208
2299
  }
2209
- std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
2210
- #endif
2300
+ std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
2211
2301
  std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
2212
2302
  std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
2213
2303
  GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2240,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
2240
2330
  }
2241
2331
 
2242
2332
  static void ggml_vk_ctx_end(vk_context * ctx) {
2243
- #ifdef GGML_VULKAN_DEBUG
2244
- std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
2245
- #endif
2333
+ VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
2246
2334
  if (ctx->s == nullptr) {
2247
2335
  return;
2248
2336
  }
@@ -2252,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
2252
2340
  }
2253
2341
 
2254
2342
  static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
2255
- #ifdef GGML_VULKAN_DEBUG
2256
- std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
2257
- #endif
2343
+ VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
2258
2344
  if (subctx->s != nullptr) {
2259
2345
  ggml_vk_ctx_end(subctx);
2260
2346
  }
@@ -2264,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
2264
2350
  }
2265
2351
 
2266
2352
  static size_t ggml_vk_align_size(size_t width, size_t align) {
2267
- #ifdef GGML_VULKAN_DEBUG
2268
- std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
2269
- #endif
2353
+ VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
2270
2354
  return CEIL_DIV(width, align) * align;
2271
2355
  }
2272
2356
 
@@ -2280,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
2280
2364
 
2281
2365
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
2282
2366
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
2367
+ VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
2283
2368
  ggml_vk_destroy_buffer(ctx->sync_staging);
2284
2369
  ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
2285
2370
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2288,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
2288
2373
  }
2289
2374
 
2290
2375
  static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2291
- #ifdef GGML_VULKAN_DEBUG
2292
- std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
2293
- #endif
2376
+ VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2294
2377
  GGML_ASSERT(!ggml_is_contiguous(tensor));
2295
2378
  // Buffer is already mapped
2296
2379
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2395,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2395
2478
  }
2396
2479
 
2397
2480
  static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2398
- #ifdef GGML_VULKAN_DEBUG
2399
- std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
2400
- #endif
2481
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2401
2482
  // Make sure ctx owns the buffer
2402
2483
  GGML_ASSERT(dst->ctx == ctx);
2403
2484
 
@@ -2432,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2432
2513
  subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2433
2514
  return;
2434
2515
  }
2435
- #ifdef GGML_VULKAN_DEBUG
2436
- std::cerr << "STAGING" << std::endl;
2437
- #endif
2516
+ VK_LOG_DEBUG("STAGING");
2438
2517
 
2439
2518
  // Staging buffer required
2440
2519
  vk_buffer staging = ctx->staging;
@@ -2469,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2469
2548
  }
2470
2549
 
2471
2550
  static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2472
- #ifdef GGML_VULKAN_DEBUG
2473
- std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
2474
- #endif
2551
+ VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2475
2552
  return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
2476
2553
  }
2477
2554
 
2478
2555
  static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
2479
- #ifdef GGML_VULKAN_DEBUG
2480
- std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
2481
- #endif
2556
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
2482
2557
  // Buffer is already mapped
2483
2558
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2484
2559
  GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2503,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
2503
2578
  }
2504
2579
 
2505
2580
  static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
2506
- #ifdef GGML_VULKAN_DEBUG
2507
- std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
2508
- #endif
2581
+ VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
2509
2582
  ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
2510
2583
  }
2511
2584
 
2512
2585
  static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2513
- #ifdef GGML_VULKAN_DEBUG
2514
- std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
2515
- #endif
2586
+ VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2516
2587
  GGML_ASSERT(width > 0);
2517
2588
  GGML_ASSERT(height > 0);
2518
2589
  GGML_ASSERT(src != nullptr);
@@ -2546,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte
2546
2617
 
2547
2618
  return;
2548
2619
  }
2549
- #ifdef GGML_VULKAN_DEBUG
2550
- std::cerr << "STAGING" << std::endl;
2551
- #endif
2620
+ VK_LOG_DEBUG("STAGING");
2552
2621
 
2553
2622
  // Fall back to staging buffer
2554
2623
  vk_buffer staging = ctx->staging;
@@ -2575,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
2575
2644
  }
2576
2645
 
2577
2646
  static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
2578
- #ifdef GGML_VULKAN_DEBUG
2579
- std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
2580
- #endif
2647
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2581
2648
  if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2582
2649
  GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2583
2650
 
@@ -2599,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
2599
2666
  }
2600
2667
 
2601
2668
  static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2602
- #ifdef GGML_VULKAN_DEBUG
2603
- std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
2604
- #endif
2669
+ VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2605
2670
  // Make sure both buffers are on same ctx
2606
2671
  GGML_ASSERT(src->ctx == dst->ctx);
2607
2672
 
@@ -2612,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
2612
2677
 
2613
2678
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2614
2679
  if (src->ctx == dst->ctx) {
2615
- #ifdef GGML_VULKAN_DEBUG
2616
- std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
2617
- #endif
2680
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2618
2681
  // Copy within the device
2619
2682
  ggml_backend_vk_context * ctx = src->ctx;
2620
2683
 
@@ -2626,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2626
2689
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2627
2690
  ctx->device->device.resetFences({ ctx->fence });
2628
2691
  } else {
2629
- #ifdef GGML_VULKAN_DEBUG
2630
- std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
2631
- #endif
2692
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2632
2693
  // Copy device to device
2633
2694
  ggml_backend_vk_context * src_ctx = src->ctx;
2634
2695
  ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2646,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2646
2707
  }
2647
2708
 
2648
2709
  static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2649
- #ifdef GGML_VULKAN_DEBUG
2650
- std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
2651
- #endif
2710
+ VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2652
2711
  // Make sure ctx owns the buffer
2653
2712
  GGML_ASSERT(dst->ctx == ctx);
2654
2713
 
@@ -2663,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
2663
2722
  }
2664
2723
 
2665
2724
  static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
2666
- #ifdef GGML_VULKAN_DEBUG
2667
- std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
2668
- #endif
2725
+ VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
2669
2726
  const uint64_t ne0 = src->ne[0];
2670
2727
  const uint64_t ne1 = src->ne[1];
2671
2728
  const uint64_t nb0 = src->nb[0];
@@ -2693,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2693
2750
  }
2694
2751
 
2695
2752
  static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
2696
- #ifdef GGML_VULKAN_DEBUG
2697
- std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
2698
- #endif
2753
+ VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
2699
2754
  const uint64_t ne0 = dst->ne[0];
2700
2755
  const uint64_t ne1 = dst->ne[1];
2701
2756
  const uint64_t ne2 = dst->ne[2];
@@ -2719,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2719
2774
  }
2720
2775
 
2721
2776
  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
2722
- #ifdef GGML_VULKAN_DEBUG
2723
- std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
2724
- #endif
2777
+ VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
2725
2778
  // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
2726
2779
  // return 4;
2727
2780
  // }
@@ -2753,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
2753
2806
  }
2754
2807
 
2755
2808
  static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2756
- #ifdef GGML_VULKAN_DEBUG
2757
- std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
2758
- #endif
2809
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
2759
2810
  switch (ctx->device->vendor_id) {
2760
2811
  case VK_VENDOR_ID_AMD:
2761
2812
  return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2777,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
2777
2828
  }
2778
2829
 
2779
2830
  static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
2780
- #ifdef GGML_VULKAN_DEBUG
2781
- std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
2782
- #endif
2831
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
2783
2832
  return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
2784
2833
  }
2785
2834
 
@@ -2789,9 +2838,7 @@ static void ggml_vk_matmul(
2789
2838
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2790
2839
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2791
2840
  uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
2792
- #ifdef GGML_VULKAN_DEBUG
2793
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
2794
- #endif
2841
+ VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
2795
2842
  ggml_vk_sync_buffers(subctx);
2796
2843
  if (split_k == 1) {
2797
2844
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2815,12 +2862,10 @@ static void ggml_vk_matmul_id(
2815
2862
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2816
2863
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2817
2864
  uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
2818
- #ifdef GGML_VULKAN_DEBUG
2819
- std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2865
+ VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2820
2866
  "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
2821
2867
  "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
2822
- "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
2823
- #endif
2868
+ "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
2824
2869
  ggml_vk_sync_buffers(subctx);
2825
2870
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
2826
2871
  nei0, nei1, nbi1, ne11 };
@@ -2850,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2850
2895
  }
2851
2896
 
2852
2897
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2853
- #ifdef GGML_VULKAN_DEBUG
2854
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2855
- std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2856
- #endif
2898
+ VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2899
+ std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2857
2900
  const int tensor_type_size = ggml_type_size(tensor->type);
2858
2901
 
2859
2902
  const uint32_t ne = ggml_nelements(tensor);
@@ -2870,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2870
2913
  }
2871
2914
 
2872
2915
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2873
- #ifdef GGML_VULKAN_DEBUG
2874
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2916
+ VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2875
2917
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2876
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2877
- #endif
2918
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
2878
2919
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2879
2920
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
2880
2921
 
@@ -2949,7 +2990,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2949
2990
  const uint64_t d_sz = sizeof(float) * d_ne;
2950
2991
 
2951
2992
  vk_buffer d_D = extra->buffer_gpu.lock();
2952
- const uint64_t d_buf_offset = extra->offset;
2993
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
2953
2994
  GGML_ASSERT(d_D != nullptr);
2954
2995
  GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
2955
2996
  vk_buffer d_X;
@@ -2958,12 +2999,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2958
2999
  uint64_t y_buf_offset = 0;
2959
3000
  if (!src0_uma) {
2960
3001
  d_Qx = extra_src0->buffer_gpu.lock();
2961
- qx_buf_offset = extra_src0->offset;
3002
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
2962
3003
  GGML_ASSERT(d_Qx != nullptr);
2963
3004
  }
2964
3005
  if (!src1_uma) {
2965
3006
  d_Qy = extra_src1->buffer_gpu.lock();
2966
- qy_buf_offset = extra_src1->offset;
3007
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
2967
3008
  GGML_ASSERT(d_Qy != nullptr);
2968
3009
  }
2969
3010
  if (qx_needs_dequant) {
@@ -3045,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3045
3086
  }
3046
3087
 
3047
3088
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3048
- #ifdef GGML_VULKAN_DEBUG
3049
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3089
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3050
3090
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3051
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3052
- #endif
3091
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3053
3092
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3054
3093
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3055
3094
 
@@ -3114,7 +3153,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3114
3153
  const uint64_t d_sz = sizeof(float) * d_ne;
3115
3154
 
3116
3155
  vk_buffer d_D = extra->buffer_gpu.lock();
3117
- const uint64_t d_buf_offset = extra->offset;
3156
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3118
3157
  GGML_ASSERT(d_D != nullptr);
3119
3158
  vk_buffer d_X;
3120
3159
  uint64_t x_buf_offset = 0;
@@ -3122,12 +3161,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3122
3161
  uint64_t y_buf_offset = 0;
3123
3162
  if(!src0_uma) {
3124
3163
  d_Qx = extra_src0->buffer_gpu.lock();
3125
- qx_buf_offset = extra_src0->offset;
3164
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
3126
3165
  GGML_ASSERT(d_Qx != nullptr);
3127
3166
  }
3128
3167
  if(!src1_uma) {
3129
3168
  d_Qy = extra_src1->buffer_gpu.lock();
3130
- qy_buf_offset = extra_src1->offset;
3169
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3131
3170
  GGML_ASSERT(d_Qy != nullptr);
3132
3171
  }
3133
3172
  if (qx_needs_dequant) {
@@ -3200,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3200
3239
  }
3201
3240
 
3202
3241
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3203
- #ifdef GGML_VULKAN_DEBUG
3204
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3242
+ VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3205
3243
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3206
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3207
- #endif
3244
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3208
3245
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3209
3246
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3210
3247
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3246,14 +3283,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3246
3283
  const uint64_t d_sz = sizeof(float) * d_ne;
3247
3284
 
3248
3285
  vk_buffer d_D = extra->buffer_gpu.lock();
3249
- const uint64_t d_buf_offset = extra->offset;
3286
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3250
3287
  GGML_ASSERT(d_D != nullptr);
3251
3288
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3252
- const uint64_t qx_buf_offset = extra_src0->offset;
3289
+ const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3253
3290
  GGML_ASSERT(d_Qx != nullptr);
3254
3291
  if (!src1_uma) {
3255
3292
  d_Qy = extra_src1->buffer_gpu.lock();
3256
- qy_buf_offset = extra_src1->offset;
3293
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3257
3294
  GGML_ASSERT(d_Qx != nullptr);
3258
3295
  }
3259
3296
 
@@ -3273,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3273
3310
  }
3274
3311
 
3275
3312
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3276
- #ifdef GGML_VULKAN_DEBUG
3277
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3313
+ VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3278
3314
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3279
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3280
- #endif
3315
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3281
3316
  GGML_ASSERT(!ggml_is_transposed(src0));
3282
3317
  GGML_ASSERT(!ggml_is_transposed(src1));
3283
3318
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3323,14 +3358,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3323
3358
  const uint64_t d_sz = sizeof(float) * d_ne;
3324
3359
 
3325
3360
  vk_buffer d_D = extra->buffer_gpu.lock();
3326
- const uint64_t d_buf_offset = extra->offset;
3361
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3327
3362
  GGML_ASSERT(d_D != nullptr);
3328
3363
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3329
- const uint64_t qx_buf_offset = extra_src0->offset;
3364
+ const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3330
3365
  GGML_ASSERT(d_Qx != nullptr);
3331
3366
  if (!src1_uma) {
3332
3367
  d_Qy = extra_src1->buffer_gpu.lock();
3333
- qy_buf_offset = extra_src1->offset;
3368
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3334
3369
  GGML_ASSERT(d_Qx != nullptr);
3335
3370
  }
3336
3371
 
@@ -3350,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3350
3385
  }
3351
3386
 
3352
3387
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3353
- #ifdef GGML_VULKAN_DEBUG
3354
- std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
3355
- #endif
3388
+ VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3356
3389
  if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3357
3390
  ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3358
3391
  } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3365,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
3365
3398
  }
3366
3399
 
3367
3400
  static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3368
- #ifdef GGML_VULKAN_DEBUG
3369
- std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3401
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3370
3402
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3371
3403
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3372
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3373
- #endif
3404
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3374
3405
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3375
3406
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
3376
3407
 
@@ -3459,7 +3490,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3459
3490
  const uint64_t d_sz = sizeof(float) * d_ne;
3460
3491
 
3461
3492
  vk_buffer d_D = extra->buffer_gpu.lock();
3462
- const uint64_t d_buf_offset = extra->offset;
3493
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3463
3494
  GGML_ASSERT(d_D != nullptr);
3464
3495
  vk_buffer d_X;
3465
3496
  uint64_t x_buf_offset = 0;
@@ -3467,17 +3498,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3467
3498
  uint64_t y_buf_offset = 0;
3468
3499
  if (!src0_uma) {
3469
3500
  d_Qx = extra_src0->buffer_gpu.lock();
3470
- qx_buf_offset = extra_src0->offset;
3501
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
  GGML_ASSERT(d_Qx != nullptr);
  }
  if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
- qy_buf_offset = extra_src1->offset;
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
  GGML_ASSERT(d_Qy != nullptr);
  }
  if (!ids_uma) {
  d_ids = extra_ids->buffer_gpu.lock();
- ids_buf_offset = extra_ids->offset;
+ ids_buf_offset = extra_ids->offset + ids->view_offs;
  GGML_ASSERT(d_ids != nullptr);
  }
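Note: the hunks above (and the matching ones below for the vector and matrix-id paths) all make the same change: since a view tensor no longer gets its own offset baked into its extra (see the buffer init_tensor change further down), every place that binds a descriptor now adds the tensor's view_offs on top of the allocation offset. A minimal sketch of the resulting convention, with an illustrative helper name that is not part of the upstream API:

```cpp
// Illustrative only: effective byte offset of a tensor inside its vk_buffer.
// extra->offset is where the underlying allocation starts; view_offs is the
// extra displacement of a view into its view_src, and 0 for non-view tensors.
static uint64_t vk_tensor_offset(const ggml_tensor * t) {
    const ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) t->extra;
    return extra->offset + t->view_offs;
}
```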
  if (qx_needs_dequant) {
@@ -3556,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3556
3587
  }
3557
3588
 
3558
3589
  static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3559
- #ifdef GGML_VULKAN_DEBUG
3560
- std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3590
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3561
3591
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3562
3592
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3563
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3564
- #endif
3593
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3565
3594
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3566
3595
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3567
3596
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3636,7 +3665,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3636
3665
  const uint64_t d_sz = sizeof(float) * d_ne;
3637
3666
 
3638
3667
  vk_buffer d_D = extra->buffer_gpu.lock();
3639
- const uint64_t d_buf_offset = extra->offset;
3668
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3640
3669
  GGML_ASSERT(d_D != nullptr);
3641
3670
  vk_buffer d_X;
3642
3671
  uint64_t x_buf_offset = 0;
@@ -3644,17 +3673,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3644
3673
  uint64_t y_buf_offset = 0;
3645
3674
  if(!src0_uma) {
3646
3675
  d_Qx = extra_src0->buffer_gpu.lock();
3647
- qx_buf_offset = extra_src0->offset;
3676
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
3648
3677
  GGML_ASSERT(d_Qx != nullptr);
3649
3678
  }
3650
3679
  if(!src1_uma) {
3651
3680
  d_Qy = extra_src1->buffer_gpu.lock();
3652
- qy_buf_offset = extra_src1->offset;
3681
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3653
3682
  GGML_ASSERT(d_Qy != nullptr);
3654
3683
  }
3655
3684
  if(!ids_uma) {
3656
3685
  d_ids = extra_ids->buffer_gpu.lock();
3657
- ids_buf_offset = extra_ids->offset;
3686
+ ids_buf_offset = extra_ids->offset + ids->view_offs;
3658
3687
  GGML_ASSERT(d_ids != nullptr);
3659
3688
  }
3660
3689
  if (qx_needs_dequant) {
@@ -3724,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3724
3753
  }
3725
3754
 
3726
3755
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
3727
- #ifdef GGML_VULKAN_DEBUG
3728
- std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
3729
- #endif
3756
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
3730
3757
  if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3731
3758
  ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
3732
3759
  } else {
@@ -3769,9 +3796,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
3769
3796
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3770
3797
 
3771
3798
  const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
3772
- const uint64_t src_offset = extra_src0->offset;
3799
+ const uint64_t src_offset = extra_src0->offset + src0->view_offs;
3773
3800
  vk_buffer dst_buf = extra->buffer_gpu.lock();
3774
- const uint64_t dst_offset = extra->offset;
3801
+ const uint64_t dst_offset = extra->offset + dst->view_offs;
3775
3802
 
3776
3803
  std::vector<vk::BufferCopy> copies;
3777
3804
 
@@ -3908,10 +3935,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3908
3935
  }
3909
3936
  } else {
3910
3937
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3911
- return ctx->device->pipeline_rope_f32;
3938
+ return ctx->device->pipeline_rope_norm_f32;
3912
3939
  }
3913
3940
  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3914
- return ctx->device->pipeline_rope_f16;
3941
+ return ctx->device->pipeline_rope_norm_f16;
3915
3942
  }
3916
3943
  }
3917
3944
  return nullptr;
@@ -3960,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
3960
3987
 
3961
3988
  template<typename PC>
3962
3989
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3963
- #ifdef GGML_VULKAN_DEBUG
3964
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3990
+ VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3965
3991
  if (src1 != nullptr) {
3966
3992
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3967
3993
  }
3968
3994
  if (src2 != nullptr) {
3969
3995
  std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
3970
3996
  }
3971
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3972
- #endif
3997
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
3973
3998
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
3974
3999
  GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
3975
4000
  GGML_ASSERT(dst->extra != nullptr);
@@ -4062,21 +4087,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4062
4087
  }
4063
4088
 
4064
4089
  GGML_ASSERT(d_D != nullptr);
4065
- uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
4090
+ uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
4066
4091
  GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
4067
4092
  if(!src0_uma) {
4068
4093
  d_X = extra_src0->buffer_gpu.lock();
4069
- x_buf_offset = extra_src0->offset;
4094
+ x_buf_offset = extra_src0->offset + src0->view_offs;
4070
4095
  GGML_ASSERT(d_X != nullptr);
4071
4096
  }
4072
4097
  if (use_src1 && !src1_uma) {
4073
4098
  d_Y = extra_src1->buffer_gpu.lock();
4074
- y_buf_offset = extra_src1->offset;
4099
+ y_buf_offset = extra_src1->offset + src1->view_offs;
4075
4100
  GGML_ASSERT(d_Y != nullptr);
4076
4101
  }
4077
4102
  if (use_src2 && !src2_uma) {
4078
4103
  d_Z = extra_src2->buffer_gpu.lock();
4079
- z_buf_offset = extra_src2->offset;
4104
+ z_buf_offset = extra_src2->offset + src2->view_offs;
4080
4105
  GGML_ASSERT(d_Z != nullptr);
4081
4106
  }
4082
4107
 
@@ -4155,24 +4180,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4155
4180
  ggml_vk_sync_buffers(subctx);
4156
4181
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4157
4182
  } else if (op == GGML_OP_ROPE) {
4158
- const int mode = ((int32_t *) dst->op_params)[2];
4159
- const bool is_neox = mode & 2;
4160
-
4161
- if (is_neox) {
4162
- // Empty src2 is possible in rope, but the shader needs a buffer
4163
- vk_subbuffer subbuf_z;
4164
- if (use_src2) {
4165
- subbuf_z = { d_Z, z_buf_offset, z_sz };
4166
- } else {
4167
- subbuf_z = { d_X, 0, d_X->size };
4168
- }
4169
-
4170
- ggml_vk_sync_buffers(subctx);
4171
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4183
+ // Empty src2 is possible in rope, but the shader needs a buffer
4184
+ vk_subbuffer subbuf_z;
4185
+ if (use_src2) {
4186
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
4172
4187
  } else {
4173
- ggml_vk_sync_buffers(subctx);
4174
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4188
+ subbuf_z = { d_X, 0, d_X->size };
4175
4189
  }
4190
+
4191
+ ggml_vk_sync_buffers(subctx);
4192
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4176
4193
  } else if (use_src2) {
4177
4194
  ggml_vk_sync_buffers(subctx);
4178
4195
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4353,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4336
4353
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
4337
4354
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4338
4355
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4339
- const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4356
+ const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4340
4357
 
4341
4358
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
4342
4359
  (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4411,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
4394
4411
 
4395
4412
  static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4396
4413
  const int n_dims = ((int32_t *) dst->op_params)[1];
4397
- const int mode = ((int32_t *) dst->op_params)[2];
4414
+ // const int mode = ((int32_t *) dst->op_params)[2];
4398
4415
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
4399
4416
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
4400
4417
  const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4421,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
4404
4421
  const float beta_fast = ((float *) dst->op_params)[9];
4405
4422
  const float beta_slow = ((float *) dst->op_params)[10];
4406
4423
 
4407
- const bool is_neox = mode & 2;
4408
-
4409
- #pragma message("TODO: update rope NORM mode to match NEOX mode")
4410
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
4411
-
4412
4424
  float corr_dims[2];
4413
4425
  ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
4414
4426
 
4415
- if (is_neox) {
4416
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
4417
- const float inv_ndims = -1.0f / n_dims;
4418
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4419
- (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4420
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
4421
- src2 != nullptr,
4422
- });
4423
- } else {
4424
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4425
- (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
4426
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
4427
- });
4428
- }
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+ src2 != nullptr,
+ });
  }
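Note: the NEOX special case disappears from the host code here: both RoPE modes now receive the same vk_op_rope_push_constants, with theta_scale precomputed on the CPU and the mode handled by the selected pipeline (pipeline_rope_norm_* above). A hedged sketch of the standard RoPE angle recurrence this relies on, not copied from the shader:

```cpp
#include <cmath>

// theta_scale = freq_base^(-2 / n_dims); the rotation angle for position `pos`
// and dimension pair `i` is then pos * theta_scale^i, which a shader can
// accumulate by repeated multiplication without a separate inv_ndims constant.
float rope_theta(float pos, float freq_base, int n_dims, int i) {
    const float theta_scale = powf(freq_base, -2.0f / (float) n_dims);
    return pos * powf(theta_scale, (float) i);
}
```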
 
  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4487,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
4487
4492
 
4488
4493
  template <typename X_TYPE, typename Y_TYPE>
4489
4494
  static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
4490
- #ifdef GGML_VULKAN_DEBUG
4491
- std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
4492
- #endif
4495
+ VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
4493
4496
  const size_t x_ne = m * k * batch;
4494
4497
  const size_t y_ne = k * n * batch;
4495
4498
  const size_t d_ne = m * n * batch;
@@ -4903,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
4903
4906
  }
4904
4907
 
4905
4908
  static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
4906
- #ifdef GGML_VULKAN_DEBUG
4907
- std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
4908
- #endif
4909
+ VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
4909
4910
  // Check transfers are correct
4910
4911
  vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4911
4912
 
@@ -4989,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
4989
4990
  }
4990
4991
 
4991
4992
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
4992
- #ifdef GGML_VULKAN_DEBUG
4993
- std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
4994
- #endif
4993
+ VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
4995
4994
  const size_t x_sz = sizeof(float) * ne;
4996
4995
  const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
4997
4996
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5068,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
5068
5067
  }
5069
5068
 
5070
5069
  static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
5071
- #ifdef GGML_VULKAN_DEBUG
5072
- std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
5073
- #endif
5070
+ VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
5074
5071
  const size_t x_ne = m * k * batch;
5075
5072
  const size_t y_ne = k * n * batch;
5076
5073
  const size_t d_ne = m * n * batch;
@@ -5254,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5254
5251
  #endif
5255
5252
 
5256
5253
  static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
5257
- #ifdef GGML_VULKAN_DEBUG
5258
- std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
5259
- #endif
5254
+ VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
5260
5255
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
5261
5256
  extra->reset();
5262
5257
  tensor->extra = extra;
@@ -5264,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
5264
5259
  }
5265
5260
 
5266
5261
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
5267
- #ifdef GGML_VULKAN_DEBUG
5268
- std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
5269
- #endif
5262
+ VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
5270
5263
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5271
5264
 
5272
5265
  if (extra == nullptr) {
@@ -5301,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5301
5294
 
5302
5295
  bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
5303
5296
 
5304
- const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
5297
+ const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
5305
5298
  const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
5306
5299
 
5307
5300
  int split_k;
@@ -5379,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5379
5372
  }
5380
5373
 
5381
5374
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5382
- #ifdef GGML_VULKAN_DEBUG
5383
- std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
5384
- #endif
5385
5375
  #if defined(GGML_VULKAN_RUN_TESTS)
5386
5376
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
5387
5377
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5520,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5520
5510
  #endif
5521
5511
 
5522
5512
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
5513
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
5523
5514
  // Resize buffer
5524
5515
  if (ctx->prealloc_x != nullptr) {
5525
5516
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5527,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5527
5518
  ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
5528
5519
  }
5529
5520
  if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
5521
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
5530
5522
  // Resize buffer
5531
5523
  if (ctx->prealloc_y != nullptr) {
5532
5524
  ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5534,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5534
5526
  ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
5535
5527
  }
5536
5528
  if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
5529
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
5537
5530
  // Resize buffer
5538
5531
  if (ctx->prealloc_split_k != nullptr) {
5539
5532
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5541,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5541
5534
  ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
5542
5535
  }
5543
5536
  if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
5537
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
5544
5538
  // Resize buffer
5545
5539
  if (ctx->staging != nullptr) {
5546
5540
  ggml_vk_destroy_buffer(ctx->staging);
@@ -5558,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5558
5552
  return;
5559
5553
  }
5560
5554
 
5561
- #ifdef GGML_VULKAN_DEBUG
5562
- std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
5563
- #endif
5555
+ VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
5564
5556
  ctx->semaphore_idx = 0;
5565
5557
  ctx->staging_offset = 0;
5566
5558
 
@@ -5569,6 +5561,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5569
5561
  const ggml_tensor * src2 = node->src[2];
5570
5562
 
5571
5563
  switch (node->op) {
5564
+ // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
5565
+ case GGML_OP_RESHAPE:
5566
+ case GGML_OP_VIEW:
5567
+ case GGML_OP_PERMUTE:
5568
+ case GGML_OP_TRANSPOSE:
5569
+ case GGML_OP_NONE:
5570
+ return;
5572
5571
  case GGML_OP_UNARY:
5573
5572
  switch (ggml_get_unary_op(node)) {
5574
5573
  case GGML_UNARY_OP_SILU:
@@ -5590,10 +5589,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5590
5589
  case GGML_OP_CPY:
5591
5590
  case GGML_OP_CONT:
5592
5591
  case GGML_OP_DUP:
5593
- case GGML_OP_RESHAPE:
5594
- case GGML_OP_VIEW:
5595
- case GGML_OP_PERMUTE:
5596
- case GGML_OP_TRANSPOSE:
5597
5592
  case GGML_OP_NORM:
5598
5593
  case GGML_OP_RMS_NORM:
5599
5594
  case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5596,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_ROPE:
  case GGML_OP_MUL_MAT:
  case GGML_OP_MUL_MAT_ID:
- case GGML_OP_NONE:
  case GGML_OP_ARGSORT:
  case GGML_OP_SUM_ROWS:
  break;
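Note: the block added at the top of the switch in ggml_vk_build_graph is why these case labels disappear further down: ops that only reshuffle metadata are now rejected before a compute context is created. A hedged sketch of the predicate, with an illustrative name (the real code simply returns from the switch):

```cpp
// Illustrative only: "empty" ops touch no tensor data, so the Vulkan backend
// skips them instead of creating a compute context and recording a no-op.
static bool vk_op_is_noop(ggml_op op) {
    switch (op) {
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NONE:
            return true;
        default:
            return false;
    }
}
```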
@@ -5654,12 +5648,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5654
5648
  case GGML_OP_DUP:
5655
5649
  ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
5656
5650
 
5657
- break;
5658
- case GGML_OP_RESHAPE:
5659
- case GGML_OP_VIEW:
5660
- case GGML_OP_PERMUTE:
5661
- case GGML_OP_TRANSPOSE:
5662
- case GGML_OP_NONE:
5663
5651
  break;
5664
5652
  case GGML_OP_NORM:
5665
5653
  ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5700,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5712
5700
  return;
5713
5701
  }
5714
5702
 
5715
- extra->ready = true;
5716
5703
  extra->ctx_idx = ctx->compute_ctx->idx;
5717
5704
 
5718
5705
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5788,16 +5775,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5788
5775
  return true;
5789
5776
  }
5790
5777
 
5791
- #ifdef GGML_VULKAN_DEBUG
5792
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5793
- #endif
5778
+ VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
5794
5779
 
5795
5780
  #ifdef GGML_VULKAN_CHECK_RESULTS
5796
5781
  ggml_vk_check_results_0(ctx, params, tensor);
5797
5782
  #endif
5798
5783
 
5799
- GGML_ASSERT(extra->ready);
5800
-
5801
5784
  vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
5802
5785
 
5803
5786
  // Only run if ctx hasn't been submitted yet
@@ -5822,16 +5805,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5822
5805
  subctx.out_memcpys.clear();
5823
5806
  }
5824
5807
 
5825
- extra->ready = false;
5826
-
5827
5808
  return true;
5828
5809
  }
5829
5810
 
5830
5811
  // Clean up after graph processing is done
5831
5812
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5832
- #ifdef GGML_VULKAN_DEBUG
5833
- std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
5834
- #endif
5813
+ VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
5835
5814
  for (auto& buffer : ctx->gc.temp_buffers) {
5836
5815
  ggml_vk_pool_free(ctx, buffer);
5837
5816
  }
@@ -5875,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5875
5854
 
5876
5855
  // Clean up on backend free
5877
5856
  static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5878
- #ifdef GGML_VULKAN_DEBUG
5879
- std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
5880
- #endif
5857
+ VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
5881
5858
  ggml_vk_graph_cleanup(ctx);
5882
5859
 
5883
5860
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5943,7 +5920,9 @@ struct ggml_backend_vk_buffer_context {
5943
5920
 
5944
5921
  ~ggml_backend_vk_buffer_context() {
5945
5922
  ggml_vk_destroy_buffer(dev_buffer);
5946
- delete[] temp_tensor_extras;
5923
+ if (temp_tensor_extras != nullptr) {
5924
+ delete[] temp_tensor_extras;
5925
+ }
5947
5926
  }
5948
5927
 
5949
5928
  ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5970,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
5970
5949
  }
5971
5950
 
5972
5951
  GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
5973
- #ifdef GGML_VULKAN_DEBUG
5974
- std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
5975
- #endif
5952
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
5976
5953
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5977
5954
  ggml_vk_destroy_buffer(ctx->dev_buffer);
5978
5955
  delete ctx;
@@ -5985,49 +5962,41 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
5985
5962
  }
5986
5963
 
5987
5964
  GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
5988
- #ifdef GGML_VULKAN_DEBUG
5989
- std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
5990
- #endif
5965
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
5991
5966
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5992
5967
 
5993
- ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
5994
- if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
5968
+ if (tensor->view_src != nullptr) {
5995
5969
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
5996
- ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
5997
- extra->buffer_gpu = extra_view->buffer_gpu;
5998
- extra->offset = extra_view->offset + tensor->view_offs;
5970
+ GGML_ASSERT(tensor->view_src->extra != nullptr);
5971
+ tensor->extra = tensor->view_src->extra;
5999
5972
  } else {
5973
+ ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
6000
5974
  extra->buffer_gpu = ctx->dev_buffer;
6001
5975
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
5976
+ tensor->extra = extra;
6002
5977
  }
6003
-
6004
- tensor->extra = extra;
6005
5978
  }
6006
5979
 
6007
5980
  GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6008
- #ifdef GGML_VULKAN_DEBUG
6009
- std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6010
- #endif
5981
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6011
5982
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6012
5983
 
6013
5984
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6014
5985
 
6015
5986
  vk_buffer buf = extra->buffer_gpu.lock();
6016
5987
 
6017
- ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
5988
+ ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
6018
5989
  }
6019
5990
 
6020
5991
  GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6021
- #ifdef GGML_VULKAN_DEBUG
6022
- std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6023
- #endif
5992
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6024
5993
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6025
5994
 
6026
5995
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6027
5996
 
6028
5997
  vk_buffer buf = extra->buffer_gpu.lock();
6029
5998
 
6030
- ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
5999
+ ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
6031
6000
  }
6032
6001
 
6033
6002
  GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6007,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
6038
6007
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
6039
6008
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
6040
6009
 
6041
- ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
6010
+ ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
6042
6011
 
6043
6012
  return true;
6044
6013
  }
@@ -6078,11 +6047,15 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
6078
6047
  }
6079
6048
 
6080
6049
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6081
- #ifdef GGML_VULKAN_DEBUG
6082
- std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6083
- #endif
6050
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
6084
6051
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6085
- vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
6052
+
6053
+ vk_buffer dev_buffer = nullptr;
6054
+ try {
6055
+ dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
6056
+ } catch (const vk::SystemError& e) {
6057
+ return nullptr;
6058
+ }
6086
6059
 
6087
6060
  ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
6088
6061
 
@@ -6105,33 +6078,19 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
6105
6078
  UNUSED(buft);
6106
6079
  }
6107
6080
 
6108
- GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
6109
- if (!ggml_backend_is_vk(backend)) {
6110
- return false;
6111
- }
6112
-
6113
- ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
6114
- ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6115
-
6116
- return buft_ctx->ctx->idx == ctx->idx;
6117
- }
6118
-
6119
6081
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
6120
6082
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
6121
6083
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
6122
6084
  /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
6123
6085
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
6124
6086
  /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
6125
- /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
6126
6087
  /* .is_host = */ NULL,
6127
6088
  };
6128
6089
 
6129
6090
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6130
6091
  ggml_vk_instance_init();
6131
6092
 
6132
- #ifdef GGML_VULKAN_DEBUG
6133
- std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
6134
- #endif
6093
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
6135
6094
 
6136
6095
  GGML_ASSERT(dev_num < vk_instance.device_indices.size());
6137
6096
 
@@ -6155,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
6155
6114
  }
6156
6115
 
6157
6116
  GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6158
- #ifdef GGML_VULKAN_DEBUG
6159
- std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
6160
- #endif
6117
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
6161
6118
  ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
6162
6119
  }
6163
6120
 
6164
6121
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6165
- #ifdef GGML_VULKAN_DEBUG
6166
- std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6167
- #endif
6122
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
6168
6123
  size += 32; // Behave like the CPU buffer type
6169
6124
  void * ptr = nullptr;
6170
6125
  try {
@@ -6198,7 +6153,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6198
6153
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
6199
6154
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
6200
6155
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
6201
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
6202
6156
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6203
6157
  },
6204
6158
  /* .context = */ nullptr,
@@ -6222,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6222
6176
 
6223
6177
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6224
6178
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6225
- #ifdef GGML_VULKAN_DEBUG
6226
- std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
6227
- #endif
6179
+ VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6228
6180
 
6229
6181
  size_t idx = ctx->idx;
6230
6182
 
@@ -6248,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
6248
6200
  }
6249
6201
 
6250
6202
  GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6251
- #ifdef GGML_VULKAN_DEBUG
6252
- std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
6253
- #endif
6203
+ VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
6254
6204
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6255
6205
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6256
6206
 
@@ -6264,13 +6214,11 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6264
6214
 
6265
6215
  vk_buffer buf = extra->buffer_gpu.lock();
6266
6216
 
6267
- ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
6217
+ ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
6268
6218
  }
6269
6219
 
6270
6220
  GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6271
- #ifdef GGML_VULKAN_DEBUG
6272
- std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
6273
- #endif
6221
+ VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
6274
6222
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6275
6223
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6276
6224
 
@@ -6284,13 +6232,11 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6284
6232
 
6285
6233
  vk_buffer buf = extra->buffer_gpu.lock();
6286
6234
 
6287
- ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
6235
+ ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
6288
6236
  }
6289
6237
 
6290
6238
  GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6291
- #ifdef GGML_VULKAN_DEBUG
6292
- std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
6293
- #endif
6239
+ VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
6294
6240
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6295
6241
  if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
6296
6242
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6305,7 +6251,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
6305
6251
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
6306
6252
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
6307
6253
 
6308
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
6254
+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
6309
6255
  return true;
6310
6256
  }
6311
6257
 
@@ -6313,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
6313
6259
  }
6314
6260
 
6315
6261
  GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6316
- #ifdef GGML_VULKAN_DEBUG
6317
- std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
6318
- #endif
6262
+ VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
6319
6263
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6320
6264
  if(ctx->transfer_ctx == nullptr) {
6321
6265
  return;
@@ -6343,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
6343
6287
  }
6344
6288
 
6345
6289
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6346
- #ifdef GGML_VULKAN_DEBUG
6347
- std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
6348
- #endif
6290
+ VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
6349
6291
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6350
6292
 
6351
6293
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6402,7 +6344,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_SILU:
  case GGML_UNARY_OP_RELU:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  return false;
  }
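Note: GELU/SILU/RELU (and ROPE in the hunk below) now report support only for contiguous inputs, so the scheduler can place non-contiguous cases on a fallback backend instead of this one claiming them. A hedged sketch of the caller-side check, assuming the generic wrapper from ggml-backend.h:

```cpp
#include "ggml-backend.h"

// Assumes the public ggml_backend_supports_op() wrapper; the scheduler performs
// an equivalent query before assigning a node to the Vulkan backend. After this
// change a non-contiguous RELU/GELU/SILU or ROPE node returns false here.
static bool vk_can_run(ggml_backend_t backend, const struct ggml_tensor * node) {
    return ggml_backend_supports_op(backend, node);
}
```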
@@ -6478,11 +6420,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
6478
6420
  // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
6479
6421
  // } break;
6480
6422
  case GGML_OP_ROPE:
6481
- {
6482
- const int mode = ((const int32_t *) op->op_params)[2];
6483
-
6484
- return true;
6485
- } break;
6423
+ return ggml_is_contiguous(op->src[0]);
6486
6424
  case GGML_OP_NONE:
6487
6425
  case GGML_OP_RESHAPE:
6488
6426
  case GGML_OP_VIEW:
@@ -6518,6 +6456,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
  UNUSED(backend);
  }
 
+ GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+ return false;
+ }
+
+ ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+ ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+ return buft_ctx->ctx->idx == ctx->idx;
+ }
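Note: this new supports_buft callback replaces the supports_backend hook removed from the buffer-type interface in the hunk above; whether a backend can use buffers of a given type is now answered by the backend itself. A hedged caller-side sketch, assuming a ggml_backend_supports_buft() wrapper is available in this ggml revision:

```cpp
#include "ggml-backend.h"
#include "ggml-vulkan.h"

// Assumption: ggml_backend_supports_buft() is the public wrapper that ends up
// calling the backend's .supports_buft member registered further below.
static bool vk_backend_accepts(ggml_backend_t backend, size_t dev_num) {
    return ggml_backend_supports_buft(backend, ggml_backend_vk_buffer_type(dev_num));
}
```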
+
  // TODO: enable async and synchronize
6522
6471
  static ggml_backend_i ggml_backend_vk_interface = {
6523
6472
  /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6478,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
6529
6478
  /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
6530
6479
  /* .graph_plan_create = */ NULL,
6531
6480
  /* .graph_plan_free = */ NULL,
6481
+ /* .graph_plan_update = */ NULL,
6532
6482
  /* .graph_plan_compute = */ NULL,
6533
6483
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
6534
6484
  /* .supports_op = */ ggml_backend_vk_supports_op,
6485
+ /* .supports_buft = */ ggml_backend_vk_supports_buft,
6535
6486
  /* .offload_op = */ ggml_backend_vk_offload_op,
6536
6487
  /* .event_new = */ NULL,
6537
6488
  /* .event_free = */ NULL,
@@ -6549,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6549
6500
  if (vk_instance.initialized[dev_num]) {
6550
6501
  return vk_instance.backends[dev_num];
6551
6502
  }
6552
- #ifdef GGML_VULKAN_DEBUG
6553
- std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
6554
- #endif
6503
+ VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
6555
6504
 
6556
6505
  ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
6557
6506
  ggml_vk_init(ctx, dev_num);
@@ -6725,7 +6674,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6725
6674
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6726
6675
 
6727
6676
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6728
- ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
6677
+ ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
6729
6678
  }
6730
6679
 
6731
6680
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6767,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6767
6716
  return;
6768
6717
  }
6769
6718
 
6770
- #ifdef GGML_VULKAN_DEBUG
6771
- std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
6772
- #endif
6719
+ VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
6773
6720
 
6774
6721
  ggml_tensor * src0 = tensor->src[0];
6775
6722
  ggml_tensor * src1 = tensor->src[1];
@@ -6809,7 +6756,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6809
6756
  } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6810
6757
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6811
6758
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6812
- uint64_t offset = extra->offset;
6759
+ uint64_t offset = extra->offset + src0->view_offs;
6813
6760
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
6814
6761
  for (int i3 = 0; i3 < src0->ne[3]; i3++) {
6815
6762
  for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6798,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6851
6798
  } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6852
6799
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6853
6800
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6854
- uint64_t offset = extra->offset;
6801
+ uint64_t offset = extra->offset + src1->view_offs;
6855
6802
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
6856
6803
  for (int i3 = 0; i3 < src1->ne[3]; i3++) {
6857
6804
  for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6856,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6909
6856
  } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6910
6857
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6911
6858
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6912
- uint64_t offset = extra->offset;
6859
+ uint64_t offset = extra->offset + src2->view_offs;
6913
6860
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6914
6861
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6915
6862
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7075,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7075
7022
  return;
7076
7023
  }
7077
7024
 
7078
- #ifdef GGML_VULKAN_DEBUG
7079
- std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
7080
- #endif
7025
+ VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
7081
7026
 
7082
7027
  ggml_tensor * src0 = tensor->src[0];
7083
7028
  ggml_tensor * src1 = tensor->src[1];
@@ -7092,11 +7037,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7092
7037
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7093
7038
 
7094
7039
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
7095
- if (extra->offset + tensor_size >= buffer_gpu->size) {
7096
- tensor_size = buffer_gpu->size - (extra->offset);
7040
+ if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
7041
+ tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
7097
7042
  }
7098
7043
 
7099
- ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
7044
+ ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
7100
7045
  }
7101
7046
 
7102
7047
  float first_error_result = -1.0f;