llama_cpp 0.16.0 → 0.16.2

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/ext/llama_cpp/extconf.rb +3 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +14 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +4 -0
  7. data/vendor/tmp/llama.cpp/Makefile +119 -54
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
  126. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  127. data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
  128. data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
  129. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
  130. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
  131. data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
  132. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
  133. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
  134. data/vendor/tmp/llama.cpp/ggml.c +158 -414
  135. data/vendor/tmp/llama.cpp/ggml.h +6 -0
  136. data/vendor/tmp/llama.cpp/llama.cpp +628 -279
  137. data/vendor/tmp/llama.cpp/llama.h +9 -1
  138. data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
  139. data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
  140. data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
  141. data/vendor/tmp/llama.cpp/unicode.h +1 -1
  142. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
@@ -1,5 +1,5 @@
  #include "ggml-vulkan.h"
-
+ #include <vulkan/vulkan_core.h>
  #ifdef GGML_VULKAN_RUN_TESTS
  #include <chrono>
  #endif
@@ -8,13 +8,15 @@

  #include <algorithm>
  #include <cmath>
+ #include <iomanip>
  #include <iostream>
- #include <limits>
  #include <tuple>
  #include <vector>
  #include <sstream>
  #include <utility>
  #include <memory>
+ #include <limits>
+ #include <map>

  #include "ggml.h"
  #include "ggml-backend-impl.h"
@@ -56,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  } \
  } while (0)

+ #ifdef GGML_VULKAN_DEBUG
+ #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+ #else
+ #define VK_LOG_DEBUG(msg) ((void) 0)
+ #endif // GGML_VULKAN_DEBUG
+
  struct ggml_backend_vk_context;

  struct vk_queue {
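Nearly every remaining hunk of this kind in the file is the same mechanical substitution: an #ifdef GGML_VULKAN_DEBUG / std::cerr / #endif triple collapses into one VK_LOG_DEBUG(...) statement. A minimal standalone sketch of the pattern (file name and call site are ours, not from the diff):

// vk_log_sketch.cpp - compile-time log elision, as introduced above.
// Build with: g++ -DGGML_VULKAN_DEBUG vk_log_sketch.cpp && ./a.out
// Without the define, VK_LOG_DEBUG(msg) expands to ((void) 0), so the
// stream expression and its arguments are never evaluated or even compiled.
#include <iostream>

#ifdef GGML_VULKAN_DEBUG
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif

int main() {
    int idx = 2;
    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");  // one statement per call site
    return 0;
}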
@@ -150,7 +158,7 @@ struct vk_device {
  vk_pipeline pipeline_relu_f32;
  vk_pipeline pipeline_diag_mask_inf_f32;
  vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
- vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+ vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
  vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
  vk_pipeline pipeline_argsort_f32;
  vk_pipeline pipeline_sum_rows_f32;
@@ -158,9 +166,7 @@ struct vk_device {
  std::vector<vk_pipeline_ref> pipelines;

  ~vk_device() {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "destroy device " << name << std::endl;
- #endif
+ VK_LOG_DEBUG("destroy device " << name);
  device.destroyCommandPool(compute_queue.pool);
  if (!single_queue) {
  device.destroyCommandPool(transfer_queue.pool);
@@ -195,9 +201,7 @@ struct vk_buffer_struct {
  if (size == 0) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

  device->device.freeMemory(device_memory);
  device->device.destroyBuffer(buffer);
@@ -283,26 +287,15 @@ struct vk_op_diag_mask_push_constants {

  struct vk_op_rope_push_constants {
  uint32_t ncols;
+ uint32_t n_dims;
  float freq_scale;
  uint32_t p_delta_rows;
  float freq_base;
  float ext_factor;
  float attn_factor;
- float corr_dims[4];
- };
-
- struct vk_op_rope_neox_push_constants {
- uint32_t ncols;
- uint32_t ndims;
- float freq_scale;
- uint32_t p_delta_rows;
- float freq_base;
- float ext_factor;
- float attn_factor;
- float corr_dims[4];
+ float corr_dims[2];
  float theta_scale;
- float inv_ndims;
- uint32_t has_freq_facs;
+ uint32_t has_ff;
  };

  struct vk_op_soft_max_push_constants {
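Both rope paths now share this single push-constant block; the neox-only struct above is deleted. A standalone size check (ours, not from the diff; assumes the usual 4-byte scalar alignment) shows what the shaders now receive:

// rope_pc_size.cpp - sanity-check the unified rope push-constant layout
#include <cstdint>

struct vk_op_rope_push_constants {
    uint32_t ncols;
    uint32_t n_dims;        // folded in from the removed neox-only struct
    float    freq_scale;
    uint32_t p_delta_rows;
    float    freq_base;
    float    ext_factor;
    float    attn_factor;
    float    corr_dims[2];  // shrunk from float[4]
    float    theta_scale;
    uint32_t has_ff;        // renamed from has_freq_facs; inv_ndims is gone
};

// 9 four-byte scalars plus an 8-byte array: 44 bytes, no padding expected.
static_assert(sizeof(vk_op_rope_push_constants) == 44, "unexpected padding");

int main() { return 0; }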
@@ -345,15 +338,12 @@ struct vk_context {
  };

  struct ggml_tensor_extra_gpu {
- bool ready;
-
  size_t ctx_idx;

  vk_buffer_ref buffer_gpu;
  uint64_t offset;

  void reset() {
- ready = false;
  ctx_idx = 0;
  buffer_gpu.reset();
  offset = 0;
@@ -368,6 +358,49 @@ struct ggml_vk_garbage_collector {
  std::vector<vk_context> contexts;
  };

+ #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+ #include <mutex>
+
+ #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+ static std::string format_size(size_t size) {
+ const size_t kib = 1024;
+ const size_t mib = kib * 1024;
+ const size_t gib = mib * 1024;
+
+ std::ostringstream oss;
+ oss << std::fixed << std::setprecision(2);
+
+ if (size >= gib) {
+ oss << static_cast<double>(size) / gib << " GiB";
+ } else if (size >= mib) {
+ oss << static_cast<double>(size) / mib << " MiB";
+ } else if (size >= kib) {
+ oss << static_cast<double>(size) / kib << " KiB";
+ } else {
+ oss << size << " B";
+ }
+
+ return oss.str();
+ }
+
+ static std::mutex log_mutex;
+
+ class vk_memory_logger {
+ public:
+ vk_memory_logger(): total_device(0), total_host(0) {}
+ void log_allocation(vk_buffer_ref buf_ref, size_t size);
+ void log_deallocation(vk_buffer_ref buf_ref);
+
+ private:
+ std::map<vk::Buffer, size_t> allocations; // Track allocations
+ size_t total_device;
+ size_t total_host;
+ };
+ #else
+ #define VK_LOG_MEMORY(msg) ((void) 0)
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct ggml_backend_vk_context {
  std::string name;

@@ -392,8 +425,45 @@ struct ggml_backend_vk_context {
  bool initialized;

  size_t idx;
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ vk_memory_logger memory_logger;
+ #endif
  };

+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ const std::string type = device ? "device" : "host";
+ allocations[buf->buffer] = size;
+ total_device += device ? size : 0;
+ total_host += device ? 0 : size;
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ }
+
+ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+ if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+ return;
+ }
+
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ std::string type = device ? "device" : "host";
+ auto it = allocations.find(buf->buffer);
+ total_device -= device ? it->second : 0;
+ total_host -= device ? 0 : it->second;
+ if (it != allocations.end()) {
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ allocations.erase(it);
+ } else {
+ VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+ }
+ }
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct vk_instance_t {
  vk::Instance instance;
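Two notes on the block above. First, in log_deallocation the two it->second subtractions run before the it != allocations.end() check, so only the log message is guarded against a lookup miss. Second, format_size has no Vulkan dependencies, so its rounding can be checked in isolation; a small driver (ours, with the function body copied verbatim from the hunk):

// format_size_demo.cpp - exercises format_size from the hunk above
#include <iostream>
#include <iomanip>
#include <sstream>
#include <string>

static std::string format_size(size_t size) {
    const size_t kib = 1024;
    const size_t mib = kib * 1024;
    const size_t gib = mib * 1024;

    std::ostringstream oss;
    oss << std::fixed << std::setprecision(2);

    if (size >= gib) {
        oss << static_cast<double>(size) / gib << " GiB";
    } else if (size >= mib) {
        oss << static_cast<double>(size) / mib << " MiB";
    } else if (size >= kib) {
        oss << static_cast<double>(size) / kib << " KiB";
    } else {
        oss << size << " B";
    }

    return oss.str();
}

int main() {
    std::cout << format_size(512)        << "\n";  // 512 B
    std::cout << format_size(1536)       << "\n";  // 1.50 KiB
    std::cout << format_size(268435456)  << "\n";  // 256.00 MiB
    std::cout << format_size(3221225472) << "\n";  // 3.00 GiB
}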
@@ -406,15 +476,11 @@ struct vk_instance_t {
  };

  static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
  static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

  if (devices[idx].expired()) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Initializing new vk_device" << std::endl;
- #endif
+ VK_LOG_DEBUG("Initializing new vk_device");
  std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
  device->initialized = false;
  devices[idx] = device;
@@ -441,9 +507,7 @@ static vk_instance_t vk_instance;
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

  static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
  GGML_ASSERT(parameter_count > 0);
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

@@ -544,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
  }

  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
  for (auto& pool : pipeline->descriptor_pools) {
  device.destroyDescriptorPool(pool);
  }
@@ -564,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
  }

  static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
  if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
  // Enough descriptors are available
  return;
@@ -596,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
  }

  static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
  pipeline->descriptor_set_idx = 0;
  }

  static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
  if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
  // Reuse command buffer
  return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -625,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
  }

  static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_submission()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_submission()");
  vk_submission s;
  s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
  s.wait_semaphores = std::move(wait_semaphores);
@@ -636,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
  }

  static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
  if (ctx->seqs.empty()) {
  return;
  }
@@ -712,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
  }

  static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
  const uint32_t qfsize = queue_family_props.size();

  // Try with avoid preferences first
@@ -760,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
  }

  static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_queue()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_queue()");
  q.queue_family_index = queue_family_index;

  vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -776,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
  }

  static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_context()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_context()");
  ctx->gc.contexts.emplace_back();
  vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
  memset((void *) result, 0, sizeof(vk_context));
@@ -788,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
  }

  static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
  vk::SemaphoreCreateInfo ci{};
  ci.setPNext(&tci);
@@ -800,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
  }

  static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
  vk::SemaphoreCreateInfo ci{};
@@ -821,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
  // Requires command buffers to be done

  ctx->device->device.resetCommandPool(q.pool);
@@ -843,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
  }

  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
  vk_buffer buf = std::make_shared<vk_buffer_struct>();

  if (size == 0) {
@@ -905,8 +943,8 @@

  buf->device = ctx->device;

- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Created buffer " << buf->buffer << std::endl;
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ ctx->memory_logger.log_allocation(buf, size);
  #endif

  return buf;
@@ -941,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
  }

  static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+ if (buf == nullptr) {
+ return;
+ }
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ buf->ctx->memory_logger.log_deallocation(buf);
+ #endif
+
  buf.reset();
  }
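ggml_vk_destroy_buffer drops one reference rather than freeing anything directly: vk_buffer is a shared_ptr to vk_buffer_struct, whose destructor (earlier hunk) performs the actual freeMemory/destroyBuffer once the last owner lets go. A reduced sketch of that ownership model, with placeholder names and a printf standing in for the Vulkan teardown:

// buffer_ownership.cpp - reduced sketch of the shared_ptr lifetime used above
#include <cstddef>
#include <cstdio>
#include <memory>

struct fake_buffer_struct {
    std::size_t size = 0;
    ~fake_buffer_struct() {
        if (size == 0) return;                      // mirrors the early return in ~vk_buffer_struct
        std::printf("freeing %zu bytes\n", size);   // stands in for freeMemory/destroyBuffer
    }
};
using fake_buffer = std::shared_ptr<fake_buffer_struct>;  // plays the role of vk_buffer

static void destroy_buffer(fake_buffer & buf) {
    if (buf == nullptr) {
        return;      // the null guard added in this version
    }
    buf.reset();     // drops this reference; frees only if it was the last one
}

int main() {
    fake_buffer a = std::make_shared<fake_buffer_struct>();
    a->size = 4096;
    fake_buffer b = a;   // another owner, e.g. a locked vk_buffer_ref
    destroy_buffer(a);   // prints nothing: b still keeps the allocation alive
    destroy_buffer(b);   // now prints "freeing 4096 bytes"
    return 0;
}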
@@ -949,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
  }

  static void ggml_vk_sync_buffers(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_sync_buffers()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_sync_buffers()");
  const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

  ctx->s->buffer.pipelineBarrier(
@@ -965,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
  }

  static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_wait_events()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_wait_events()");
  if (events.empty()) {
  return;
  }
@@ -1002,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
  }

  static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

  const std::shared_ptr<vk_device> device = ctx->device;

@@ -1055,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

  if (device->fp16) {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1153,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1244,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
  } else {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1342,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1442,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1468,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  // dequant shaders
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1481,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1481
1521
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1482
1522
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1483
1523
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1484
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1485
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1486
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
1487
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1488
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1524
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1525
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1526
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
1527
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1528
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1489
1529
 
1490
1530
  // get_rows
1491
1531
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1537,11 +1577,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1537
1577
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1538
1578
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1539
1579
 
1540
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1541
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1580
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1581
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1542
1582
 
1543
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1544
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1583
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1584
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1545
1585
 
1546
1586
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1547
1587
 
@@ -1551,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1551
1591
  static void ggml_vk_print_gpu_info(size_t idx) {
1552
1592
  GGML_ASSERT(idx < vk_instance.device_indices.size());
1553
1593
  size_t dev_num = vk_instance.device_indices[idx];
1554
- #ifdef GGML_VULKAN_DEBUG
1555
- std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
1556
- #endif
1594
+ VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
1557
1595
  GGML_ASSERT(vk_instance.initialized);
1558
1596
 
1559
1597
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
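Note: this hunk, and many below, replace the repetitive #ifdef GGML_VULKAN_DEBUG / std::cerr / #endif blocks with a single VK_LOG_DEBUG(...) statement. The macro's definition is not part of this diff; a minimal sketch of the shape it presumably takes (an assumption, not a quote from the file):

    // Hypothetical sketch: the streamed message compiles away entirely when
    // GGML_VULKAN_DEBUG is not defined, so call sites stay zero-cost.
    #ifdef GGML_VULKAN_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)
    #endif

Because the argument is pasted into a << chain, call sites can stream arbitrary values, e.g. VK_LOG_DEBUG("ggml_vk_init(" << dev_num << ")"), without paying any formatting cost in release builds.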
@@ -1569,8 +1607,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
  vk::PhysicalDeviceProperties2 props2;
  vk::PhysicalDeviceMaintenance3Properties props3;
  vk::PhysicalDeviceSubgroupProperties subgroup_props;
+ vk::PhysicalDeviceDriverProperties driver_props;
  props2.pNext = &props3;
  props3.pNext = &subgroup_props;
+ subgroup_props.pNext = &driver_props;
  physical_device.getProperties2(&props2);

  const size_t subgroup_size = subgroup_props.subgroupSize;
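The hunk above threads a fourth struct, driver_props, onto the getProperties2 query so the reporting line below can print the driver name. Vulkan fills every structure reachable through the pNext chain in one call. A self-contained sketch of the pattern (illustrative only):

    // Chain extension structs via pNext, then query once; the Vulkan-Hpp
    // constructors pre-set each struct's sType, so only the links are needed.
    vk::PhysicalDeviceProperties2      props2;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &driver_props;
    physical_device.getProperties2(&props2);
    // props2.properties and driver_props.driverName are now both populated.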
@@ -1614,7 +1654,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
  fp16 = fp16 && vk12_features.shaderFloat16;

  std::string device_name = props2.properties.deviceName.data();
- std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+ std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;

  if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
  std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1628,9 +1668,7 @@ void ggml_vk_instance_init() {
  if (vk_instance_initialized) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_instance_init()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_instance_init()");

  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };

@@ -1707,10 +1745,80 @@ void ggml_vk_instance_init() {

  // Default to using all dedicated GPUs
  for (size_t i = 0; i < devices.size(); i++) {
- vk::PhysicalDeviceProperties props = devices[i].getProperties();
+ vk::PhysicalDeviceProperties2 new_props;
+ vk::PhysicalDeviceDriverProperties new_driver;
+ vk::PhysicalDeviceIDProperties new_id;
+ new_props.pNext = &new_driver;
+ new_driver.pNext = &new_id;
+ devices[i].getProperties2(&new_props);
+
+ if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ // Check if there are two physical devices corresponding to the same GPU
+ auto old_device = std::find_if(
+ vk_instance.device_indices.begin(),
+ vk_instance.device_indices.end(),
+ [&devices, &new_id](const size_t k){
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceIDProperties old_id;
+ old_props.pNext = &old_id;
+ devices[k].getProperties2(&old_props);
+ return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+ }
+ );
+ if (old_device == vk_instance.device_indices.end()) {
+ vk_instance.device_indices.push_back(i);
+ } else {
+ // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+ // This can cause an error when splitting layers across the devices, need to keep only 1
+ VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
+
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceDriverProperties old_driver;
+ old_props.pNext = &old_driver;
+ devices[*old_device].getProperties2(&old_props);
+
+ std::map<vk::DriverId, int> driver_priorities {};
+ int old_priority = std::numeric_limits<int>::max();
+ int new_priority = std::numeric_limits<int>::max();
+
+ // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+ // Smaller number -> higher priority
+ switch (old_props.properties.vendorID) {
+ case VK_VENDOR_ID_AMD:
+ driver_priorities[vk::DriverId::eMesaRadv] = 1;
+ driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+ driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+ break;
+ case VK_VENDOR_ID_INTEL:
+ driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+ driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+ break;
+ case VK_VENDOR_ID_NVIDIA:
+ driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+ #if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+ driver_priorities[vk::DriverId::eMesaNvk] = 2;
+ #endif
+ break;
+ }
+
+ if (driver_priorities.count(old_driver.driverID)) {
+ old_priority = driver_priorities[old_driver.driverID];
+ }
+ if (driver_priorities.count(new_driver.driverID)) {
+ new_priority = driver_priorities[new_driver.driverID];
+ }
+
+ if (new_priority < old_priority) {
+ auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+ vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+ vk_instance.device_indices.push_back(i);

- if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
- vk_instance.device_indices.push_back(i);
+ VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
+ }
+ else {
+ VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
+ }
+ }
  }
  }
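The deduplication hunk above keys on deviceUUID so that one GPU exposed by two installed Vulkan drivers (for example RADV and the AMD proprietary driver) is enumerated only once, with the priority table picking the preferred driver. The VK_VENDOR_ID_* constants it switches on are defined earlier in ggml-vulkan.cpp, outside this diff; they are the standard PCI vendor IDs, presumably along these lines:

    // Assumed definitions (PCI-SIG vendor IDs); the real constants live
    // earlier in ggml-vulkan.cpp and are not shown in this diff.
    #define VK_VENDOR_ID_AMD    0x1002
    #define VK_VENDOR_ID_APPLE  0x106b
    #define VK_VENDOR_ID_INTEL  0x8086
    #define VK_VENDOR_ID_NVIDIA 0x10de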
 
@@ -1732,9 +1840,7 @@ void ggml_vk_instance_init() {
  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
  ggml_vk_instance_init();

  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1907,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  }

  static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
  switch (type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_Q4_0:
@@ -1931,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
  }

  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
@@ -1969,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  }

  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);

  switch (a_type) {
@@ -1996,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
  }

  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_id_f32;
  }
@@ -2031,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
  }

  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32);

  switch (a_type) {
@@ -2058,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
  }

  static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+ VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
  int best_i = -1;
  size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
  int worst_i = -1;
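ggml_vk_pool_malloc now also emits VK_LOG_MEMORY, a macro new in this release that traces allocations separately from the general debug log. Its definition is not part of this hunk; a plausible sketch, assuming it mirrors VK_LOG_DEBUG under its own flag:

    // Assumed shape of the memory-tracing macro, gated by a separate define
    // so allocation logs can be enabled without the full debug output.
    #ifdef GGML_VULKAN_MEMORY_DEBUG
    #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
    #else
    #define VK_LOG_MEMORY(msg) ((void) 0)
    #endif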
@@ -2088,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
  ggml_vk_destroy_buffer(b);
  }

- return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+ return ggml_vk_create_buffer_device(ctx, size);
  }

  static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
  for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
  vk_buffer& b = ctx->buffer_pool[i];
  if (b == nullptr) {
@@ -2115,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
  }
  }

+ VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
  // Otherwise create new buffer
  vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
  ctx->gc.temp_buffers.push_back(buf);
@@ -2123,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
  }

  static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
  vk_buffer buf = ggml_vk_create_buffer(ctx, size,
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2147,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
  if (ptr == nullptr) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
  vk_buffer buf;
  size_t index;
  for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2201,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+ VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
  for (auto& buffer : buffers) {
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
  }
- std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
- #endif
+ std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
  std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
  std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
  GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2240,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
  }

  static void ggml_vk_ctx_end(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
  if (ctx->s == nullptr) {
  return;
  }
@@ -2252,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
  }

  static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
  if (subctx->s != nullptr) {
  ggml_vk_ctx_end(subctx);
  }
@@ -2264,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
  }

  static size_t ggml_vk_align_size(size_t width, size_t align) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
  return CEIL_DIV(width, align) * align;
  }
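ggml_vk_align_size rounds width up to the next multiple of align. Assuming the usual CEIL_DIV(a, b) = (a + b - 1) / b in integer arithmetic (the macro itself is defined elsewhere in the file), a quick worked example:

    // align_up(100, 32): CEIL_DIV(100, 32) = (100 + 31) / 32 = 4, so the
    // result is 4 * 32 = 128 -- the smallest multiple of 32 that fits 100.
    #define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))

    static size_t align_up(size_t width, size_t align) {
        return CEIL_DIV(width, align) * align;
    }

The same CEIL_DIV idiom computes the workgroup counts wg0..wg2 in ggml_vk_dispatch_pipeline above.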
 
@@ -2280,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
2280
2364
 
2281
2365
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
2282
2366
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
2367
+ VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
2283
2368
  ggml_vk_destroy_buffer(ctx->sync_staging);
2284
2369
  ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
2285
2370
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2288,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
2288
2373
  }
2289
2374
 
2290
2375
  static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2291
- #ifdef GGML_VULKAN_DEBUG
2292
- std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
2293
- #endif
2376
+ VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2294
2377
  GGML_ASSERT(!ggml_is_contiguous(tensor));
2295
2378
  // Buffer is already mapped
2296
2379
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2395,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2395
2478
  }
2396
2479
 
2397
2480
  static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2398
- #ifdef GGML_VULKAN_DEBUG
2399
- std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
2400
- #endif
2481
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2401
2482
  // Make sure ctx owns the buffer
2402
2483
  GGML_ASSERT(dst->ctx == ctx);
2403
2484
 
@@ -2432,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2432
2513
  subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2433
2514
  return;
2434
2515
  }
2435
- #ifdef GGML_VULKAN_DEBUG
2436
- std::cerr << "STAGING" << std::endl;
2437
- #endif
2516
+ VK_LOG_DEBUG("STAGING");
2438
2517
 
2439
2518
  // Staging buffer required
2440
2519
  vk_buffer staging = ctx->staging;
@@ -2469,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2469
2548
  }
2470
2549
 
2471
2550
  static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2472
- #ifdef GGML_VULKAN_DEBUG
2473
- std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
2474
- #endif
2551
+ VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2475
2552
  return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
2476
2553
  }
2477
2554
 
2478
2555
  static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
2479
- #ifdef GGML_VULKAN_DEBUG
2480
- std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
2481
- #endif
2556
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
2482
2557
  // Buffer is already mapped
2483
2558
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2484
2559
  GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2503,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
2503
2578
  }
2504
2579
 
2505
2580
  static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
2506
- #ifdef GGML_VULKAN_DEBUG
2507
- std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
2508
- #endif
2581
+ VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
2509
2582
  ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
2510
2583
  }
2511
2584
 
2512
2585
  static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2513
- #ifdef GGML_VULKAN_DEBUG
2514
- std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
2515
- #endif
2586
+ VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2516
2587
  GGML_ASSERT(width > 0);
2517
2588
  GGML_ASSERT(height > 0);
2518
2589
  GGML_ASSERT(src != nullptr);
@@ -2546,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte
2546
2617
 
2547
2618
  return;
2548
2619
  }
2549
- #ifdef GGML_VULKAN_DEBUG
2550
- std::cerr << "STAGING" << std::endl;
2551
- #endif
2620
+ VK_LOG_DEBUG("STAGING");
2552
2621
 
2553
2622
  // Fall back to staging buffer
2554
2623
  vk_buffer staging = ctx->staging;
@@ -2575,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
2575
2644
  }
2576
2645
 
2577
2646
  static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
2578
- #ifdef GGML_VULKAN_DEBUG
2579
- std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
2580
- #endif
2647
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2581
2648
  if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2582
2649
  GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2583
2650
 
@@ -2599,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
2599
2666
  }
2600
2667
 
2601
2668
  static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2602
- #ifdef GGML_VULKAN_DEBUG
2603
- std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
2604
- #endif
2669
+ VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2605
2670
  // Make sure both buffers are on same ctx
2606
2671
  GGML_ASSERT(src->ctx == dst->ctx);
2607
2672
 
@@ -2612,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
2612
2677
 
2613
2678
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2614
2679
  if (src->ctx == dst->ctx) {
2615
- #ifdef GGML_VULKAN_DEBUG
2616
- std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
2617
- #endif
2680
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2618
2681
  // Copy within the device
2619
2682
  ggml_backend_vk_context * ctx = src->ctx;
2620
2683
 
@@ -2626,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2626
2689
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2627
2690
  ctx->device->device.resetFences({ ctx->fence });
2628
2691
  } else {
2629
- #ifdef GGML_VULKAN_DEBUG
2630
- std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
2631
- #endif
2692
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2632
2693
  // Copy device to device
2633
2694
  ggml_backend_vk_context * src_ctx = src->ctx;
2634
2695
  ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2646,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2646
2707
  }
2647
2708
 
2648
2709
  static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2649
- #ifdef GGML_VULKAN_DEBUG
2650
- std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
2651
- #endif
2710
+ VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2652
2711
  // Make sure ctx owns the buffer
2653
2712
  GGML_ASSERT(dst->ctx == ctx);
2654
2713
 
@@ -2663,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
2663
2722
  }
2664
2723
 
2665
2724
  static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
2666
- #ifdef GGML_VULKAN_DEBUG
2667
- std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
2668
- #endif
2725
+ VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
2669
2726
  const uint64_t ne0 = src->ne[0];
2670
2727
  const uint64_t ne1 = src->ne[1];
2671
2728
  const uint64_t nb0 = src->nb[0];
@@ -2693,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2693
2750
  }
2694
2751
 
2695
2752
  static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
2696
- #ifdef GGML_VULKAN_DEBUG
2697
- std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
2698
- #endif
2753
+ VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
2699
2754
  const uint64_t ne0 = dst->ne[0];
2700
2755
  const uint64_t ne1 = dst->ne[1];
2701
2756
  const uint64_t ne2 = dst->ne[2];
@@ -2719,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2719
2774
  }
2720
2775
 
2721
2776
  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
2722
- #ifdef GGML_VULKAN_DEBUG
2723
- std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
2724
- #endif
2777
+ VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
2725
2778
  // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
2726
2779
  // return 4;
2727
2780
  // }
@@ -2753,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
2753
2806
  }
2754
2807
 
2755
2808
  static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2756
- #ifdef GGML_VULKAN_DEBUG
2757
- std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
2758
- #endif
2809
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
2759
2810
  switch (ctx->device->vendor_id) {
2760
2811
  case VK_VENDOR_ID_AMD:
2761
2812
  return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2777,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
2777
2828
  }
2778
2829
 
2779
2830
  static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
2780
- #ifdef GGML_VULKAN_DEBUG
2781
- std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
2782
- #endif
2831
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
2783
2832
  return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
2784
2833
  }
2785
2834
 
@@ -2789,9 +2838,7 @@ static void ggml_vk_matmul(
2789
2838
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2790
2839
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2791
2840
  uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
2792
- #ifdef GGML_VULKAN_DEBUG
2793
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
2794
- #endif
2841
+ VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
2795
2842
  ggml_vk_sync_buffers(subctx);
2796
2843
  if (split_k == 1) {
2797
2844
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2815,12 +2862,10 @@ static void ggml_vk_matmul_id(
2815
2862
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2816
2863
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2817
2864
  uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
2818
- #ifdef GGML_VULKAN_DEBUG
2819
- std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2865
+ VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2820
2866
  "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
2821
2867
  "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
2822
- "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
2823
- #endif
2868
+ "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
2824
2869
  ggml_vk_sync_buffers(subctx);
2825
2870
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
2826
2871
  nei0, nei1, nbi1, ne11 };
@@ -2850,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2850
2895
  }
2851
2896
 
2852
2897
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2853
- #ifdef GGML_VULKAN_DEBUG
2854
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2855
- std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2856
- #endif
2898
+ VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2899
+ std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2857
2900
  const int tensor_type_size = ggml_type_size(tensor->type);
2858
2901
 
2859
2902
  const uint32_t ne = ggml_nelements(tensor);
@@ -2870,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2870
2913
  }
2871
2914
 
2872
2915
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2873
- #ifdef GGML_VULKAN_DEBUG
2874
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2916
+ VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2875
2917
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2876
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2877
- #endif
2918
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
2878
2919
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2879
2920
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
2880
2921
 
@@ -2949,7 +2990,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2949
2990
  const uint64_t d_sz = sizeof(float) * d_ne;
2950
2991
 
2951
2992
  vk_buffer d_D = extra->buffer_gpu.lock();
2952
- const uint64_t d_buf_offset = extra->offset;
2993
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
2953
2994
  GGML_ASSERT(d_D != nullptr);
2954
2995
  GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
2955
2996
  vk_buffer d_X;
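This hunk and the matching ones below fix offset handling for view tensors: every device offset now adds the tensor's view_offs. For a tensor created as a view of another, extra->offset locates the parent allocation inside the Vulkan buffer, while view_offs is the byte offset of the view within the parent's data (zero for non-views); omitting it made views read and write at the parent's start. A hedged sketch of the resolved address (field names as used in this backend, helper name hypothetical):

    // Illustration only: base byte offset of a tensor inside its vk_buffer.
    // view_offs is 0 unless the tensor is a view into another tensor.
    static uint64_t vk_tensor_base_offset(const ggml_tensor * t, const ggml_tensor_extra_gpu * extra) {
        return extra->offset + t->view_offs;
    }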
@@ -2958,12 +2999,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2958
2999
  uint64_t y_buf_offset = 0;
2959
3000
  if (!src0_uma) {
2960
3001
  d_Qx = extra_src0->buffer_gpu.lock();
2961
- qx_buf_offset = extra_src0->offset;
3002
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
2962
3003
  GGML_ASSERT(d_Qx != nullptr);
2963
3004
  }
2964
3005
  if (!src1_uma) {
2965
3006
  d_Qy = extra_src1->buffer_gpu.lock();
2966
- qy_buf_offset = extra_src1->offset;
3007
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
2967
3008
  GGML_ASSERT(d_Qy != nullptr);
2968
3009
  }
2969
3010
  if (qx_needs_dequant) {
@@ -3045,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3045
3086
  }
3046
3087
 
3047
3088
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3048
- #ifdef GGML_VULKAN_DEBUG
3049
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3089
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3050
3090
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3051
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3052
- #endif
3091
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3053
3092
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3054
3093
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3055
3094
 
@@ -3114,7 +3153,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3114
3153
  const uint64_t d_sz = sizeof(float) * d_ne;
3115
3154
 
3116
3155
  vk_buffer d_D = extra->buffer_gpu.lock();
3117
- const uint64_t d_buf_offset = extra->offset;
3156
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3118
3157
  GGML_ASSERT(d_D != nullptr);
3119
3158
  vk_buffer d_X;
3120
3159
  uint64_t x_buf_offset = 0;
@@ -3122,12 +3161,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3122
3161
  uint64_t y_buf_offset = 0;
3123
3162
  if(!src0_uma) {
3124
3163
  d_Qx = extra_src0->buffer_gpu.lock();
3125
- qx_buf_offset = extra_src0->offset;
3164
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
3126
3165
  GGML_ASSERT(d_Qx != nullptr);
3127
3166
  }
3128
3167
  if(!src1_uma) {
3129
3168
  d_Qy = extra_src1->buffer_gpu.lock();
3130
- qy_buf_offset = extra_src1->offset;
3169
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3131
3170
  GGML_ASSERT(d_Qy != nullptr);
3132
3171
  }
3133
3172
  if (qx_needs_dequant) {
@@ -3200,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3200
3239
  }
3201
3240
 
3202
3241
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3203
- #ifdef GGML_VULKAN_DEBUG
3204
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3242
+ VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3205
3243
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3206
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3207
- #endif
3244
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3208
3245
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3209
3246
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3210
3247
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3246,14 +3283,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3246
3283
  const uint64_t d_sz = sizeof(float) * d_ne;
3247
3284
 
3248
3285
  vk_buffer d_D = extra->buffer_gpu.lock();
3249
- const uint64_t d_buf_offset = extra->offset;
3286
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3250
3287
  GGML_ASSERT(d_D != nullptr);
3251
3288
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3252
- const uint64_t qx_buf_offset = extra_src0->offset;
3289
+ const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3253
3290
  GGML_ASSERT(d_Qx != nullptr);
3254
3291
  if (!src1_uma) {
3255
3292
  d_Qy = extra_src1->buffer_gpu.lock();
3256
- qy_buf_offset = extra_src1->offset;
3293
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3257
3294
  GGML_ASSERT(d_Qx != nullptr);
3258
3295
  }
3259
3296
 
@@ -3273,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3273
3310
  }
3274
3311
 
3275
3312
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3276
- #ifdef GGML_VULKAN_DEBUG
3277
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3313
+ VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3278
3314
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3279
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3280
- #endif
3315
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3281
3316
  GGML_ASSERT(!ggml_is_transposed(src0));
3282
3317
  GGML_ASSERT(!ggml_is_transposed(src1));
3283
3318
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3323,14 +3358,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3323
3358
  const uint64_t d_sz = sizeof(float) * d_ne;
3324
3359
 
3325
3360
  vk_buffer d_D = extra->buffer_gpu.lock();
3326
- const uint64_t d_buf_offset = extra->offset;
3361
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
3327
3362
  GGML_ASSERT(d_D != nullptr);
3328
3363
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
3329
- const uint64_t qx_buf_offset = extra_src0->offset;
3364
+ const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
3330
3365
  GGML_ASSERT(d_Qx != nullptr);
3331
3366
  if (!src1_uma) {
3332
3367
  d_Qy = extra_src1->buffer_gpu.lock();
3333
- qy_buf_offset = extra_src1->offset;
3368
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
3334
3369
  GGML_ASSERT(d_Qx != nullptr);
3335
3370
  }
3336
3371
 
@@ -3350,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3350
3385
  }
3351
3386
 
3352
3387
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3353
- #ifdef GGML_VULKAN_DEBUG
3354
- std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
3355
- #endif
3388
+ VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3356
3389
  if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3357
3390
  ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3358
3391
  } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3365,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
3365
3398
  }
3366
3399
 
3367
3400
  static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3368
- #ifdef GGML_VULKAN_DEBUG
3369
- std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3401
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3370
3402
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3371
3403
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3372
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3373
- #endif
3404
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3374
3405
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3375
3406
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
 
@@ -3459,7 +3490,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
  const uint64_t d_sz = sizeof(float) * d_ne;
 
  vk_buffer d_D = extra->buffer_gpu.lock();
- const uint64_t d_buf_offset = extra->offset;
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
  GGML_ASSERT(d_D != nullptr);
  vk_buffer d_X;
  uint64_t x_buf_offset = 0;
@@ -3467,17 +3498,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
  uint64_t y_buf_offset = 0;
  if (!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
- qx_buf_offset = extra_src0->offset;
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
  GGML_ASSERT(d_Qx != nullptr);
  }
  if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
- qy_buf_offset = extra_src1->offset;
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
  GGML_ASSERT(d_Qy != nullptr);
  }
  if (!ids_uma) {
  d_ids = extra_ids->buffer_gpu.lock();
- ids_buf_offset = extra_ids->offset;
+ ids_buf_offset = extra_ids->offset + ids->view_offs;
  GGML_ASSERT(d_ids != nullptr);
  }
  if (qx_needs_dequant) {
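The recurring edit in this hunk, adding the tensor's view_offs on top of the extra's base offset, is the substantive fix of this release: view tensors now share their parent's extra (see the init_tensor hunk further down), so each access site must add the view's own byte offset itself. A hedged helper capturing the pattern (the name is invented for illustration; the release inlines the addition at every site):

static uint64_t vk_tensor_offset(const ggml_tensor * t) {
    const ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) t->extra;
    // base offset of the backing buffer plus the view's own byte offset
    return extra->offset + t->view_offs;
}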
@@ -3556,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context *
  }
 
  static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
- #endif
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3636,7 +3665,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_conte
  const uint64_t d_sz = sizeof(float) * d_ne;
 
  vk_buffer d_D = extra->buffer_gpu.lock();
- const uint64_t d_buf_offset = extra->offset;
+ const uint64_t d_buf_offset = extra->offset + dst->view_offs;
  GGML_ASSERT(d_D != nullptr);
  vk_buffer d_X;
  uint64_t x_buf_offset = 0;
@@ -3644,17 +3673,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_conte
  uint64_t y_buf_offset = 0;
  if(!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
- qx_buf_offset = extra_src0->offset;
+ qx_buf_offset = extra_src0->offset + src0->view_offs;
  GGML_ASSERT(d_Qx != nullptr);
  }
  if(!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
- qy_buf_offset = extra_src1->offset;
+ qy_buf_offset = extra_src1->offset + src1->view_offs;
  GGML_ASSERT(d_Qy != nullptr);
  }
  if(!ids_uma) {
  d_ids = extra_ids->buffer_gpu.lock();
- ids_buf_offset = extra_ids->offset;
+ ids_buf_offset = extra_ids->offset + ids->view_offs;
  GGML_ASSERT(d_ids != nullptr);
  }
  if (qx_needs_dequant) {
@@ -3724,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_conte
  }
 
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
  if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
  ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
  } else {
@@ -3769,9 +3796,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
  const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
- const uint64_t src_offset = extra_src0->offset;
+ const uint64_t src_offset = extra_src0->offset + src0->view_offs;
  vk_buffer dst_buf = extra->buffer_gpu.lock();
- const uint64_t dst_offset = extra->offset;
+ const uint64_t dst_offset = extra->offset + dst->view_offs;
 
  std::vector<vk::BufferCopy> copies;
 
@@ -3908,10 +3935,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
  }
  } else {
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- return ctx->device->pipeline_rope_f32;
+ return ctx->device->pipeline_rope_norm_f32;
  }
  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- return ctx->device->pipeline_rope_f16;
+ return ctx->device->pipeline_rope_norm_f16;
  }
  }
  return nullptr;
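pipeline_rope_f32/f16 are renamed to pipeline_rope_norm_f32/f16, which implies matching NEOX-mode pipelines chosen by the RoPE mode bit elsewhere in the selection logic. A sketch of the split this rename suggests (the neox pipeline names and the mode-bit test are assumptions based on the mode handling removed below, not lines from this diff):

const int  mode    = ((const int32_t *) dst->op_params)[2];
const bool is_neox = mode & 2; // bit 1 selects NEOX-style rotation
return is_neox ? ctx->device->pipeline_rope_neox_f32
               : ctx->device->pipeline_rope_norm_f32;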
@@ -3960,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 
  template<typename PC>
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+ VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
  if (src1 != nullptr) {
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
  }
  if (src2 != nullptr) {
  std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
  }
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
- #endif
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
  GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
  GGML_ASSERT(dst->extra != nullptr);
@@ -4062,21 +4087,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }
 
  GGML_ASSERT(d_D != nullptr);
- uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+ uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
  if(!src0_uma) {
  d_X = extra_src0->buffer_gpu.lock();
- x_buf_offset = extra_src0->offset;
+ x_buf_offset = extra_src0->offset + src0->view_offs;
  GGML_ASSERT(d_X != nullptr);
  }
  if (use_src1 && !src1_uma) {
  d_Y = extra_src1->buffer_gpu.lock();
- y_buf_offset = extra_src1->offset;
+ y_buf_offset = extra_src1->offset + src1->view_offs;
  GGML_ASSERT(d_Y != nullptr);
  }
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
- z_buf_offset = extra_src2->offset;
+ z_buf_offset = extra_src2->offset + src2->view_offs;
  GGML_ASSERT(d_Z != nullptr);
  }
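The destination offset above is rounded down to the device's minStorageBufferOffsetAlignment before it reaches the descriptor; the remainder is carried into the shader instead (see the d_offset computation in ggml_vk_cpy below). The rounding itself, as a minimal sketch with an assumed helper name:

// round a byte offset down to the previous multiple of `alignment`
static uint64_t vk_align_down(uint64_t offset, uint64_t alignment) {
    return (offset / alignment) * alignment;
}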
 
@@ -4155,24 +4180,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else if (op == GGML_OP_ROPE) {
- const int mode = ((int32_t *) dst->op_params)[2];
- const bool is_neox = mode & 2;
-
- if (is_neox) {
- // Empty src2 is possible in rope, but the shader needs a buffer
- vk_subbuffer subbuf_z;
- if (use_src2) {
- subbuf_z = { d_Z, z_buf_offset, z_sz };
- } else {
- subbuf_z = { d_X, 0, d_X->size };
- }
-
- ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ // Empty src2 is possible in rope, but the shader needs a buffer
+ vk_subbuffer subbuf_z;
+ if (use_src2) {
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
  } else {
- ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ subbuf_z = { d_X, 0, d_X->size };
  }
+
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
  } else if (use_src2) {
  ggml_vk_sync_buffers(subctx);
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
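With the NORM/NEOX distinction moved out of dispatch, the ROPE branch is now unconditional and keeps the dummy-binding trick: every descriptor in the set must reference a valid buffer, so when src2 (the frequency factors) is absent, an already-bound buffer is substituted. The fallback condensed into one expression (a restatement of the hunk above, not new release code):

vk_subbuffer subbuf_z = use_src2 ? vk_subbuffer{ d_Z, z_buf_offset, z_sz }
                                 : vk_subbuffer{ d_X, 0, d_X->size }; // dummy; the shader is told to skip it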
@@ -4336,7 +4353,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
  const uint32_t src0_type_size = ggml_type_size(src0->type);
  const uint32_t dst_type_size = ggml_type_size(dst->type);
- const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+ const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
  (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4411,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
  static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
  const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
+ // const int mode = ((int32_t *) dst->op_params)[2];
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
  const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4421,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  const float beta_fast = ((float *) dst->op_params)[9];
  const float beta_slow = ((float *) dst->op_params)[10];
 
- const bool is_neox = mode & 2;
-
- #pragma message("TODO: update rope NORM mode to match NEOX mode")
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
-
  float corr_dims[2];
  ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
- if (is_neox) {
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
- (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
- src2 != nullptr,
- });
- } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
- (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
- });
- }
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+ src2 != nullptr,
+ });
  }
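The NEOX/NORM split is gone from ggml_vk_rope: both paths now feed one vk_op_rope_push_constants block (inv_ndims is dropped and corr_dims shrinks to two floats). A hypothetical layout matching the initializer list above (field names are guesses; only the order and types follow from the call):

struct vk_op_rope_push_constants {
    uint32_t ncols;        // src0->ne[0]
    uint32_t n_dims;
    float    freq_scale;
    uint32_t p_delta_rows; // src0->ne[1]
    float    freq_base;
    float    ext_factor;
    float    attn_factor;
    float    corr_dims[2];
    float    theta_scale;
    uint32_t has_ff;       // src2 != nullptr, i.e. frequency factors present
};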
 
  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -4487,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
 
  template <typename X_TYPE, typename Y_TYPE>
  static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
  const size_t x_ne = m * k * batch;
  const size_t y_ne = k * n * batch;
  const size_t d_ne = m * n * batch;
@@ -4903,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
  }
 
  static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
  // Check transfers are correct
  vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
@@ -4989,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
  }
 
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
  const size_t x_sz = sizeof(float) * ne;
  const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5068,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
  }
 
  static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
  const size_t x_ne = m * k * batch;
  const size_t y_ne = k * n * batch;
  const size_t d_ne = m * n * batch;
@@ -5254,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
  #endif
 
  static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
  extra->reset();
  tensor->extra = extra;
@@ -5264,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
  }
 
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
  if (extra == nullptr) {
@@ -5301,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 
  bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
 
- const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+ const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
  const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
 
  int split_k;
@@ -5379,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  }
 
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
- #endif
  #if defined(GGML_VULKAN_RUN_TESTS)
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5520,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  #endif
 
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
  // Resize buffer
  if (ctx->prealloc_x != nullptr) {
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5527,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
  }
  if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
  // Resize buffer
  if (ctx->prealloc_y != nullptr) {
  ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5534,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
  }
  if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
  // Resize buffer
  if (ctx->prealloc_split_k != nullptr) {
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5541,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
  }
  if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
  // Resize buffer
  if (ctx->staging != nullptr) {
  ggml_vk_destroy_buffer(ctx->staging);
@@ -5558,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  return;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
  ctx->semaphore_idx = 0;
  ctx->staging_offset = 0;
 
@@ -5569,6 +5561,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  const ggml_tensor * src2 = node->src[2];
 
  switch (node->op) {
+ // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ case GGML_OP_NONE:
+ return;
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(node)) {
  case GGML_UNARY_OP_SILU:
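The new early return keeps metadata-only nodes from allocating a compute context at all; the same classification reappears as the ggml_vk_is_empty() helper referenced near ggml_backend_vk_graph_compute below. A sketch consistent with both sites (assumed shape, not copied from the release):

static bool ggml_vk_op_is_noop(ggml_op op) {
    switch (op) {
        // these ops only reinterpret tensor metadata; no GPU work is recorded
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NONE:
            return true;
        default:
            return false;
    }
}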
@@ -5590,10 +5589,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_CPY:
  case GGML_OP_CONT:
  case GGML_OP_DUP:
- case GGML_OP_RESHAPE:
- case GGML_OP_VIEW:
- case GGML_OP_PERMUTE:
- case GGML_OP_TRANSPOSE:
  case GGML_OP_NORM:
  case GGML_OP_RMS_NORM:
  case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5596,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_ROPE:
  case GGML_OP_MUL_MAT:
  case GGML_OP_MUL_MAT_ID:
- case GGML_OP_NONE:
  case GGML_OP_ARGSORT:
  case GGML_OP_SUM_ROWS:
  break;
@@ -5654,12 +5648,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_DUP:
  ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
- break;
- case GGML_OP_RESHAPE:
- case GGML_OP_VIEW:
- case GGML_OP_PERMUTE:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_NONE:
  break;
  case GGML_OP_NORM:
  ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5700,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  return;
  }
 
- extra->ready = true;
  extra->ctx_idx = ctx->compute_ctx->idx;
 
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5788,16 +5775,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
  return true;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
  #ifdef GGML_VULKAN_CHECK_RESULTS
  ggml_vk_check_results_0(ctx, params, tensor);
  #endif
 
- GGML_ASSERT(extra->ready);
-
  vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
  // Only run if ctx hasn't been submitted yet
@@ -5822,16 +5805,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
  subctx.out_memcpys.clear();
  }
 
- extra->ready = false;
-
  return true;
  }
 
  // Clean up after graph processing is done
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
  for (auto& buffer : ctx->gc.temp_buffers) {
  ggml_vk_pool_free(ctx, buffer);
  }
@@ -5875,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 
  // Clean up on backend free
  static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
  ggml_vk_graph_cleanup(ctx);
 
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5943,7 +5920,9 @@ struct ggml_backend_vk_buffer_context {
 
  ~ggml_backend_vk_buffer_context() {
  ggml_vk_destroy_buffer(dev_buffer);
- delete[] temp_tensor_extras;
+ if (temp_tensor_extras != nullptr) {
+ delete[] temp_tensor_extras;
+ }
  }
 
  ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5970,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
  }
 
  GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
  ggml_vk_destroy_buffer(ctx->dev_buffer);
  delete ctx;
@@ -5985,49 +5962,41 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
  }
 
  GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
- ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
- if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+ if (tensor->view_src != nullptr) {
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
- ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
- extra->buffer_gpu = extra_view->buffer_gpu;
- extra->offset = extra_view->offset + tensor->view_offs;
+ GGML_ASSERT(tensor->view_src->extra != nullptr);
+ tensor->extra = tensor->view_src->extra;
  } else {
+ ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
  extra->buffer_gpu = ctx->dev_buffer;
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+ tensor->extra = extra;
  }
-
- tensor->extra = extra;
  }
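init_tensor no longer builds a private extra per view with the offset folded in; a view now aliases its parent's extra outright. A hedged illustration of the resulting invariant (variable names invented):

// a 64-element f32 view starting 128 floats into `parent`
ggml_tensor * v = ggml_view_1d(gctx, parent, 64, 128 * sizeof(float));
// after init_tensor: v->extra == parent->extra, and the 512-byte offset
// lives only in v->view_offs, which is why every access site in this diff
// now adds tensor->view_offs on top of extra->offset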
 
  GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
  vk_buffer buf = extra->buffer_gpu.lock();
 
- ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+ ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
  }
 
  GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
  vk_buffer buf = extra->buffer_gpu.lock();
 
- ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+ ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
  }
 
  GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6007,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
- ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+ ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
  return true;
  }
@@ -6078,11 +6047,15 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
  }
 
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
- vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+ vk_buffer dev_buffer = nullptr;
+ try {
+ dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+ } catch (const vk::SystemError& e) {
+ return nullptr;
+ }
 
  ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
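Device-buffer allocation failures now surface as a nullptr return instead of an uncaught vk::SystemError escaping through the C interface. A sketch of what a caller can do with that (assumed caller-side handling, not release code):

ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
if (buf == nullptr) {
    // allocation failed: retry with a smaller size or fall back to
    // another buffer type instead of terminating on an exception
}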
 
@@ -6105,33 +6078,19 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
  UNUSED(buft);
  }
 
- GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_vk(backend)) {
- return false;
- }
-
- ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
- ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
- return buft_ctx->ctx->idx == ctx->idx;
- }
-
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
  /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };
 
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
  ggml_vk_instance_init();
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
 
  GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
@@ -6155,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
  }
 
  GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
  ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
  }
 
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
  size += 32; // Behave like the CPU buffer type
  void * ptr = nullptr;
  try {
@@ -6198,7 +6153,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
  },
  /* .context = */ nullptr,
@@ -6222,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
 
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
 
  size_t idx = ctx->idx;
 
@@ -6248,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
  }
 
  GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6264,13 +6214,11 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
  vk_buffer buf = extra->buffer_gpu.lock();
 
- ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+ ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
  }
 
  GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6284,13 +6232,11 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
  vk_buffer buf = extra->buffer_gpu.lock();
 
- ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+ ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
  }
 
  GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6305,7 +6251,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
  return true;
  }
 
@@ -6313,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
  }
 
  GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  if(ctx->transfer_ctx == nullptr) {
  return;
@@ -6343,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
  }
 
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6402,7 +6344,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_SILU:
  case GGML_UNARY_OP_RELU:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  return false;
  }
@@ -6478,11 +6420,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
  // } break;
  case GGML_OP_ROPE:
- {
- const int mode = ((const int32_t *) op->op_params)[2];
-
- return true;
- } break;
+ return ggml_is_contiguous(op->src[0]);
  case GGML_OP_NONE:
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
@@ -6518,6 +6456,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
  UNUSED(backend);
  }
 
+ GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+ return false;
+ }
+
+ ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+ ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+ return buft_ctx->ctx->idx == ctx->idx;
+ }
+
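This is the counterpart to the supports_backend deletion above: the query direction is inverted, and each backend now reports whether it can consume a given buffer type. A hedged sketch of scheduler-side usage (the wrapper name is assumed from the matching ggml-backend change in this release):

if (ggml_backend_supports_buft(backend, buft)) {
    // tensors allocated from buft can be used directly by this backend
}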
  // TODO: enable async and synchronize
  static ggml_backend_i ggml_backend_vk_interface = {
  /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6478,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
  /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
  /* .supports_op = */ ggml_backend_vk_supports_op,
+ /* .supports_buft = */ ggml_backend_vk_supports_buft,
  /* .offload_op = */ ggml_backend_vk_offload_op,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
@@ -6549,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
  if (vk_instance.initialized[dev_num]) {
  return vk_instance.backends[dev_num];
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
 
  ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
  ggml_vk_init(ctx, dev_num);
@@ -6725,7 +6674,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+ ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
  }
 
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6767,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
 
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];
@@ -6809,7 +6756,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset;
+ uint64_t offset = extra->offset + src0->view_offs;
  if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
  for (int i3 = 0; i3 < src0->ne[3]; i3++) {
  for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6798,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset;
+ uint64_t offset = extra->offset + src1->view_offs;
  if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
  for (int i3 = 0; i3 < src1->ne[3]; i3++) {
  for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6856,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- uint64_t offset = extra->offset;
+ uint64_t offset = extra->offset + src2->view_offs;
  if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
  for (int i3 = 0; i3 < src2->ne[3]; i3++) {
  for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7075,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
 
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];
@@ -7092,11 +7037,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
- if (extra->offset + tensor_size >= buffer_gpu->size) {
- tensor_size = buffer_gpu->size - (extra->offset);
+ if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+ tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
  }
 
- ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+ ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
  }
 
  float first_error_result = -1.0f;