llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
--- a/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
+++ b/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
-
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -9,12 +9,13 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>
 
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -150,7 +151,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
 
 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[4];
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-    float inv_ndims;
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };
 
 struct vk_op_soft_max_push_constants {
@@ -345,15 +335,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
        ctx_idx = 0;
        buffer_gpu.reset();
        offset = 0;
@@ -1537,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1569,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1614,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
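
The driver name added to the log line above comes from VkPhysicalDeviceDriverProperties, which is not part of the core device properties and has to be chained onto the getProperties2() query via pNext. A minimal standalone sketch of that pattern (illustrative only, using the same Vulkan-Hpp API as the hunk; this is not code from the gem):

// Sketch: query and print the driver name for one physical device.
// The pNext chain is the same mechanism the diff above uses for subgroup_props.
#include <vulkan/vulkan.hpp>
#include <iostream>
#include <string>

static void print_driver_info(vk::PhysicalDevice physical_device) {
    vk::PhysicalDeviceProperties2      props2;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &driver_props;              // chain the extension struct
    physical_device.getProperties2(&props2);   // fills both structs in one call

    std::string device_name = props2.properties.deviceName.data();
    std::string driver_name = driver_props.driverName.data();
    std::cerr << device_name << " (" << driver_name << ")" << std::endl;
}
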
@@ -1710,7 +1699,78 @@ void ggml_vk_instance_init() {
             vk::PhysicalDeviceProperties props = devices[i].getProperties();
 
             if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-                vk_instance.device_indices.push_back(i);
+                // Check if there are two physical devices corresponding to the same GPU
+                auto old_device = std::find_if(
+                    vk_instance.device_indices.begin(),
+                    vk_instance.device_indices.end(),
+                    [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+                );
+                if (old_device == vk_instance.device_indices.end()) {
+                    vk_instance.device_indices.push_back(i);
+                } else {
+                    // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                    // This can cause error when splitting layers aross the devices, need to keep only 1
+#ifdef GGML_VULKAN_DEBUG
+                    std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
+#endif
+
+                    vk::PhysicalDeviceProperties2 old_prop;
+                    vk::PhysicalDeviceDriverProperties old_driver;
+                    old_prop.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_prop);
+
+                    vk::PhysicalDeviceProperties2 new_prop;
+                    vk::PhysicalDeviceDriverProperties new_driver;
+                    new_prop.pNext = &new_driver;
+                    devices[i].getProperties2(&new_prop);
+
+                    std::map<vk::DriverId, int> driver_priorities {};
+                    int old_priority = std::numeric_limits<int>::max();
+                    int new_priority = std::numeric_limits<int>::max();
+
+                    // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                    // Smaller number -> higher priority
+                    switch (old_prop.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                    }
+
+                    if (driver_priorities.count(old_driver.driverID)) {
+                        old_priority = driver_priorities[old_driver.driverID];
+                    }
+                    if (driver_priorities.count(new_driver.driverID)) {
+                        new_priority = driver_priorities[new_driver.driverID];
+                    }
+
+                    if (new_priority < old_priority) {
+                        auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                        vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                        vk_instance.device_indices.push_back(i);
+
+#ifdef GGML_VULKAN_DEBUG
+                        std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
+#endif
+                    }
+#ifdef GGML_VULKAN_DEBUG
+                    else {
+                        std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
+
+                    }
+#endif
+                }
             }
         }
 
@@ -2949,7 +3009,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +3018,12 @@
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
-       qx_buf_offset = extra_src0->offset;
+       qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3174,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3182,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
-       qx_buf_offset = extra_src0->offset;
+       qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3306,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3323,14 +3383,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3519,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3527,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
-       qx_buf_offset = extra_src0->offset;
+       qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
        d_ids = extra_ids->buffer_gpu.lock();
-       ids_buf_offset = extra_ids->offset;
+       ids_buf_offset = extra_ids->offset + ids->view_offs;
        GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3696,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3704,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
-       qx_buf_offset = extra_src0->offset;
+       qx_buf_offset = extra_src0->offset + src0->view_offs;
        GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
-       qy_buf_offset = extra_src1->offset;
+       qy_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
        d_ids = extra_ids->buffer_gpu.lock();
-       ids_buf_offset = extra_ids->offset;
+       ids_buf_offset = extra_ids->offset + ids->view_offs;
        GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3829,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -3908,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_rope_f32;
+            return ctx->device->pipeline_rope_norm_f32;
         }
         if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_rope_f16;
+            return ctx->device->pipeline_rope_norm_f16;
         }
     }
     return nullptr;
@@ -4062,21 +4122,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
        GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
        d_Z = extra_src2->buffer_gpu.lock();
-       z_buf_offset = extra_src2->offset;
+       z_buf_offset = extra_src2->offset + src2->view_offs;
        GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4155,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-        const int mode = ((int32_t *) dst->op_params)[2];
-        const bool is_neox = mode & 2;
-
-        if (is_neox) {
-            // Empty src2 is possible in rope, but the shader needs a buffer
-            vk_subbuffer subbuf_z;
-            if (use_src2) {
-                subbuf_z = { d_Z, z_buf_offset, z_sz };
-            } else {
-                subbuf_z = { d_X, 0, d_X->size };
-            }
-
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4388,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    if (is_neox) {
-        const float theta_scale = powf(freq_base, -2.0f/n_dims);
-        const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
-            src2 != nullptr,
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
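
For reference, the single theta_scale push constant passed above encodes the whole RoPE frequency schedule: the angle for rotation pair i at position pos is pos * freq_base^(-2*i/n_dims), which a shader can generate by repeatedly multiplying by theta_scale. A small standalone sketch of that arithmetic (example values only, not gem code):

// Sketch: theta_scale = freq_base^(-2/n_dims) generates the per-pair RoPE angles.
#include <cmath>
#include <cstdio>

int main() {
    const float freq_base   = 10000.0f;                        // example value
    const int   n_dims      = 128;                             // example head dimension
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);

    const int pos   = 42;            // example token position
    float     theta = (float) pos;   // angle for rotation pair i = 0
    for (int i = 0; i < 4; ++i) {
        std::printf("pair %d: theta = %.6f\n", i, theta);
        theta *= theta_scale;        // angle for pair i+1: pos * freq_base^(-2*(i+1)/n_dims)
    }
    return 0;
}
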
@@ -5569,6 +5609,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5637,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5644,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5696,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5748,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5831,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5855,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
@@ -5943,7 +5974,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +6023,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6045,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6058,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6069,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6082,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
     std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
@@ -6105,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -6198,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6264,7 +6288,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6308,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6329,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
 
@@ -6402,7 +6426,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
             }
@@ -6478,11 +6502,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
             // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -6725,7 +6758,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6842,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6884,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-           uint64_t offset = extra->offset;
+           uint64_t offset = extra->offset + src1->view_offs;
            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-           uint64_t offset = extra->offset;
+           uint64_t offset = extra->offset + src2->view_offs;
            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7125,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
        }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;
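
Most of the offset changes in this file follow one pattern: a view tensor now shares its view_src's ggml_tensor_extra_gpu, so every buffer read, write, and copy adds the view's own byte offset (tensor->view_offs) on top of the extra's base offset. A rough sketch of that bookkeeping with simplified stand-in structs (assumptions for illustration, not the gem's actual types):

// Sketch: why "extra->offset + tensor->view_offs" appears throughout the diff.
#include <cstdint>

struct extra_gpu_like {
    uint64_t offset;             // base offset of the backing tensor's data in the vk_buffer
};

struct tensor_like {
    uint64_t        view_offs;   // additional byte offset when this tensor is a view (0 otherwise)
    extra_gpu_like *extra;       // after this release, shared with view_src instead of duplicated
};

// Effective byte offset into the device buffer, for views and non-views alike.
static inline uint64_t effective_offset(const tensor_like *t) {
    return t->extra->offset + t->view_offs;
}
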