llama_cpp 0.16.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
--- a/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
+++ b/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
-
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -9,12 +9,13 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>
 
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -150,7 +151,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
 
 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[4];
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-    float inv_ndims;
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };
 
 struct vk_op_soft_max_push_constants {
@@ -345,15 +335,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -1537,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1569,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
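
The two added lines above extend a Vulkan pNext chain: each extension struct is linked onto the query so that a single getProperties2() call fills them all, which is how the driver name becomes available for the log line in the next hunk. A minimal standalone sketch of the same pattern (vulkan.hpp, assuming a Vulkan 1.2 capable loader; print_driver_info is a hypothetical helper, not part of the diff):

    #include <vulkan/vulkan.hpp>
    #include <iostream>

    // Chain VkPhysicalDeviceDriverProperties onto the base properties query;
    // one getProperties2() call then fills both structs.
    void print_driver_info(vk::PhysicalDevice physical_device) {
        vk::PhysicalDeviceProperties2 props2;
        vk::PhysicalDeviceDriverProperties driver_props;
        props2.pNext = &driver_props;             // link the extension struct
        physical_device.getProperties2(&props2);  // fills props2 and driver_props

        std::cerr << props2.properties.deviceName.data()
                  << " (" << driver_props.driverName << ")" << std::endl;
    }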
@@ -1614,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1710,7 +1699,78 @@ void ggml_vk_instance_init() {
         vk::PhysicalDeviceProperties props = devices[i].getProperties();
 
         if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-            vk_instance.device_indices.push_back(i);
+            // Check if there are two physical devices corresponding to the same GPU
+            auto old_device = std::find_if(
+                vk_instance.device_indices.begin(),
+                vk_instance.device_indices.end(),
+                [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+            );
+            if (old_device == vk_instance.device_indices.end()) {
+                vk_instance.device_indices.push_back(i);
+            } else {
+                // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                // This can cause error when splitting layers aross the devices, need to keep only 1
+#ifdef GGML_VULKAN_DEBUG
+                std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
+#endif
+
+                vk::PhysicalDeviceProperties2 old_prop;
+                vk::PhysicalDeviceDriverProperties old_driver;
+                old_prop.pNext = &old_driver;
+                devices[*old_device].getProperties2(&old_prop);
+
+                vk::PhysicalDeviceProperties2 new_prop;
+                vk::PhysicalDeviceDriverProperties new_driver;
+                new_prop.pNext = &new_driver;
+                devices[i].getProperties2(&new_prop);
+
+                std::map<vk::DriverId, int> driver_priorities {};
+                int old_priority = std::numeric_limits<int>::max();
+                int new_priority = std::numeric_limits<int>::max();
+
+                // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                // Smaller number -> higher priority
+                switch (old_prop.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                }
+
+                if (driver_priorities.count(old_driver.driverID)) {
+                    old_priority = driver_priorities[old_driver.driverID];
+                }
+                if (driver_priorities.count(new_driver.driverID)) {
+                    new_priority = driver_priorities[new_driver.driverID];
+                }
+
+                if (new_priority < old_priority) {
+                    auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                    vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                    vk_instance.device_indices.push_back(i);
+
+#ifdef GGML_VULKAN_DEBUG
+                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
+#endif
+                }
+#ifdef GGML_VULKAN_DEBUG
+                else {
+                    std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
+                }
+#endif
+            }
         }
     }
 
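
The hunk above deduplicates Vulkan physical devices: when two entries report the same deviceID (one GPU exposed by two installed drivers, e.g. RADV and AMDVLK), only the driver with the better priority is kept, since splitting layers across duplicate devices fails. A condensed sketch of the ranking policy, factored into a standalone helper (driver_rank is an illustrative name, not from the diff; the vendor-ID constants are the well-known PCI IDs that ggml-vulkan.cpp defines as macros):

    #include <vulkan/vulkan.hpp>
    #include <cstdint>
    #include <limits>
    #include <map>

    constexpr uint32_t VENDOR_AMD    = 0x1002; // PCI vendor IDs
    constexpr uint32_t VENDOR_INTEL  = 0x8086;
    constexpr uint32_t VENDOR_NVIDIA = 0x10de;

    // Lower rank = preferred. Unknown drivers rank last, so a recognized
    // driver always wins over an unrecognized duplicate.
    static int driver_rank(uint32_t vendor_id, vk::DriverId driver_id) {
        std::map<vk::DriverId, int> prio;
        switch (vendor_id) {
            case VENDOR_AMD:
                prio[vk::DriverId::eMesaRadv]       = 1;
                prio[vk::DriverId::eAmdOpenSource]  = 2;
                prio[vk::DriverId::eAmdProprietary] = 3;
                break;
            case VENDOR_INTEL:
                prio[vk::DriverId::eIntelOpenSourceMESA]     = 1;
                prio[vk::DriverId::eIntelProprietaryWindows] = 2;
                break;
            case VENDOR_NVIDIA:
                prio[vk::DriverId::eNvidiaProprietary] = 1;
                break;
        }
        auto it = prio.find(driver_id);
        return it != prio.end() ? it->second : std::numeric_limits<int>::max();
    }

With such a helper, the body of the else branch reduces to comparing the rank of the existing and the new entry and keeping the smaller one.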
@@ -2949,7 +3009,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +3018,12 @@
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3174,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3182,12 @@
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3306,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3323,14 +3383,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3519,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3527,17 @@
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3696,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3704,17 @@
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3829,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -3908,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_rope_f32;
+            return ctx->device->pipeline_rope_norm_f32;
         }
         if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_rope_f16;
+            return ctx->device->pipeline_rope_norm_f16;
         }
     }
     return nullptr;
@@ -4062,21 +4122,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4155,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-        const int mode = ((int32_t *) dst->op_params)[2];
-        const bool is_neox = mode & 2;
-
-        if (is_neox) {
-            // Empty src2 is possible in rope, but the shader needs a buffer
-            vk_subbuffer subbuf_z;
-            if (use_src2) {
-                subbuf_z = { d_Z, z_buf_offset, z_sz };
-            } else {
-                subbuf_z = { d_X, 0, d_X->size };
-            }
-
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4388,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    if (is_neox) {
-        const float theta_scale = powf(freq_base, -2.0f/n_dims);
-        const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
-            src2 != nullptr,
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
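
With the NORM and NEOX rope paths unified above, the shader no longer needs inv_ndims or a pow() per dimension: since theta_i = pos * freq_base^(-2i/n_dims) = pos * theta_scale^i, a single theta_scale push constant lets a kernel advance the angle with one multiply per rotary pair. A small worked check of that identity (illustration only, not code from the diff):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base = 10000.0f, pos = 42.0f;
        const int   n_dims    = 8;
        // Same expression ggml_vk_rope() now passes to the shader.
        const float theta_scale = powf(freq_base, -2.0f/n_dims);

        float theta = pos; // running angle, as a shader loop would keep it
        for (int i = 0; i < n_dims/2; ++i) {
            const float ref = pos * powf(freq_base, -2.0f*i/n_dims);
            printf("i=%d theta=%.6f ref=%.6f\n", i, theta, ref);
            theta *= theta_scale; // one multiply instead of one powf per pair
        }
        return 0;
    }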
@@ -5569,6 +5609,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5637,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5644,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5696,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5748,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5831,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5855,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
@@ -5943,7 +5974,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +6023,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
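
This hunk is the structural change behind all the "+ view_offs" edits in this file: a view tensor now shares its parent's ggml_tensor_extra_gpu verbatim instead of receiving a private copy with the offset pre-folded in, so every read, write, copy, and dispatch site must add tensor->view_offs itself. A toy model of the arithmetic (the names here are illustrative, not the diff's types):

    #include <cassert>
    #include <cstdint>

    struct extra_gpu   { uint64_t offset; };  // parent's base offset in the device buffer
    struct tensor_view { extra_gpu * extra; uint64_t view_offs; };

    // Mirrors the "extra->offset + tensor->view_offs + offset" pattern used below.
    static uint64_t buffer_offset(const tensor_view & t, uint64_t byte_offset) {
        return t.extra->offset + t.view_offs + byte_offset;
    }

    int main() {
        extra_gpu   parent { 4096 };         // parent tensor starts at 4 KiB
        tensor_view view   { &parent, 256 }; // view begins 256 bytes into it
        assert(buffer_offset(view, 16) == 4096 + 256 + 16);
        return 0;
    }

Sharing one extra between several tensors is presumably also why the per-tensor ready flag and its GGML_ASSERT were dropped earlier in this diff.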
@@ -6014,7 +6045,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6058,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6069,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
     return true;
 }
@@ -6082,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
     std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
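
vulkan.hpp reports failed allocations (such as VK_ERROR_OUT_OF_DEVICE_MEMORY) by throwing vk::SystemError; catching it here and returning nullptr turns a hard abort into a recoverable allocation failure. The same containment pattern in isolation (allocate_on_device is a hypothetical helper standing in for ggml_vk_create_buffer_device):

    #include <vulkan/vulkan.hpp>
    #include <cstddef>
    #include <iostream>

    void * allocate_on_device(size_t size); // hypothetical: throws vk::SystemError on failure

    void * try_allocate(size_t size) {
        try {
            return allocate_on_device(size);
        } catch (const vk::SystemError & e) {
            // Swallow the exception at the API boundary; a null result is the
            // C-style error signal ggml's buffer interface expects.
            std::cerr << "device allocation of " << size << " bytes failed: "
                      << e.what() << std::endl;
            return nullptr;
        }
    }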
@@ -6105,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -6198,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6264,7 +6288,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6308,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6329,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
     return true;
 }
 
@@ -6402,7 +6426,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_UNARY_OP_GELU:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_RELU:
-            return true;
+            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -6478,11 +6502,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -6725,7 +6758,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6842,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6884,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7125,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;