llama_cpp 0.15.4 → 0.16.1

Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-cuda.cu
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
  info.default_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;

+ info.devices[id].nsm = prop.multiProcessorCount;
+ info.devices[id].smpb = prop.sharedMemPerBlock;
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ info.devices[id].smpbo = prop.sharedMemPerBlock;
  info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
  #else
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- info.devices[id].smpb = prop.sharedMemPerBlock;
- info.devices[id].nsm = prop.multiProcessorCount;
  }

  for (int id = 0; id < info.device_count; ++id) {
@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
  return ctx->name.c_str();
  }

+ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+ }
+
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
  GGML_UNUSED(buft);
  }

- GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_cuda(backend)) {
- return false;
- }
-
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
- return buft_ctx->device == cuda_ctx->device;
- }
-
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
  /* .get_name = */ ggml_backend_cuda_buffer_type_name,
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };

@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {

  // cuda split buffer

- static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
- int64_t min_compute_capability = INT_MAX;
- int64_t max_compute_capability = INT_MIN;
+ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+ int64_t row_rounding = 0;
  for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
- if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
- min_compute_capability = ggml_cuda_info().devices[id].cc;
- }
- if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
- max_compute_capability = ggml_cuda_info().devices[id].cc;
- }
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+ continue;
  }
- }

- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- return max_compute_capability >= CC_RDNA2 ? 128 : 32;
- case GGML_TYPE_Q3_K:
- return min_compute_capability < CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- default:
- GGML_ASSERT(false);
- }
- #else
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q6_K:
- return 64;
- default:
- GGML_ASSERT(false);
+ const int cc = ggml_cuda_info().devices[id].cc;
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
  }
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return row_rounding;
  }

  static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
  const int64_t nrows = ggml_nrows(tensor);
- const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);

  *row_low = id == 0 ? 0 : nrows*tensor_split[id];
  *row_low -= *row_low % rounding;
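The rewritten get_row_rounding drops the per-quantization-type switch: the rounding now depends only on the devices that actually receive a non-empty slice of the tensor split, taking the largest MMQ tile height among them. Below is a minimal host-side sketch of that reduction; mmq_tile_y() is an illustrative stand-in for get_mmq_y_host(cc, get_mmq_x_max_host(cc)) and the tile values are made up.

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

constexpr int MAX_DEVICES = 4;

// Stand-in for the MMQ tile height chosen for a given compute capability.
static int64_t mmq_tile_y(int cc) {
    return cc >= 700 ? 128 : 64;  // illustrative values only
}

static int64_t row_rounding(const std::array<float, MAX_DEVICES> & split,
                            const std::array<int, MAX_DEVICES> & cc,
                            int device_count) {
    int64_t rounding = 0;
    for (int id = 0; id < device_count; ++id) {
        const float next = id + 1 < device_count ? split[id + 1] : 1.0f;
        if (split[id] >= next) {
            continue;  // this device gets no rows of the split tensor
        }
        rounding = std::max(rounding, mmq_tile_y(cc[id]));
    }
    return rounding;
}

int main() {
    // Two devices sharing the tensor: the rounding is the larger tile height.
    std::printf("%lld\n", (long long) row_rounding({0.0f, 0.5f}, {600, 800}, 2));  // 128
}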
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
  GGML_UNUSED(buft);
  }

+ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+ }
+
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
  // instead, we allocate them for each tensor separately in init_tensor
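Both ggml_backend_buft_is_cuda and ggml_backend_buft_is_cuda_split identify a buffer type by comparing its get_name function pointer rather than by any explicit type tag. A small self-contained sketch of that pattern follows; the struct and function names are illustrative stand-ins, not the real ggml interfaces.

#include <cstdio>

struct buffer_type_iface {
    const char * (*get_name)(void);
};

static const char * cuda_buft_name(void) { return "CUDA"; }
static const char * host_buft_name(void) { return "CUDA_Host"; }

// A buffer type is recognized by which get_name implementation it carries,
// so no extra type field or RTTI is needed.
static bool buft_is_cuda(const buffer_type_iface & iface) {
    return iface.get_name == cuda_buft_name;
}

int main() {
    buffer_type_iface dev  = { cuda_buft_name };
    buffer_type_iface host = { host_buft_name };
    std::printf("dev is cuda:  %d\n", buft_is_cuda(dev));   // 1
    std::printf("host is cuda: %d\n", buft_is_cuda(host));  // 0
}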
@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
  return total_size;
  }

- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cuda(backend);
-
- GGML_UNUSED(buft);
- }
-
  GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return false;

@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
  };

@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
  },
  /* .context = */ nullptr,
@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
  GGML_UNUSED(main_device);
  }

+ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+ void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+ #if !defined(GGML_USE_HIPBLAS)
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+ cudaMemcpy3DPeerParms p = {};
+ p.dstDevice = dstDevice;
+ p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+ p.srcDevice = srcDevice;
+ p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+ p.extent = make_cudaExtent(width, height, 1);
+ return cudaMemcpy3DPeerAsync(&p, stream);
+ #else
+ // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+ GGML_UNUSED(dstDevice);
+ GGML_UNUSED(srcDevice);
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+ #endif // !defined(GGML_USE_HIPBLAS)
+ }
+
  static void ggml_cuda_op_mul_mat(
  ggml_backend_cuda_context & ctx,
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
- const bool convert_src1_to_q8_1) {
+ quantize_cuda_t quantize_src1) {

  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
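The new ggml_cuda_Memcpy2DPeerAsync helper expresses a strided two-dimensional device-to-device copy: height rows of width bytes, read with row stride spitch and written with row stride dpitch, routed through cudaMemcpy3DPeerAsync on CUDA because, per the comment above, cudaMemcpy2DAsync may fail between VMM pools of different devices. The following host-side sketch shows only the memory-layout semantics of such a pitched copy, not the asynchronous peer transfer itself.

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Copy `height` rows of `width` bytes, reading with stride `spitch`
// and writing with stride `dpitch` (both strides in bytes).
static void copy_2d(void * dst, size_t dpitch, const void * src, size_t spitch,
                    size_t width, size_t height) {
    for (size_t row = 0; row < height; ++row) {
        std::memcpy((char *) dst + row*dpitch, (const char *) src + row*spitch, width);
    }
}

int main() {
    // 3 rows of 4 floats; source rows are padded to 6 floats, destination is packed.
    std::vector<float> src(3*6, 0.0f), dst(3*4, 0.0f);
    for (int r = 0; r < 3; ++r)
        for (int c = 0; c < 4; ++c)
            src[r*6 + c] = float(r*4 + c);
    copy_2d(dst.data(), 4*sizeof(float), src.data(), 6*sizeof(float), 4*sizeof(float), 3);
    std::printf("%g %g\n", dst[0], dst[11]);  // 0 11
}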
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
  }

  struct dev_data {
- ggml_cuda_pool_alloc<char> src0_dd_alloc;
+ int cc;
+
+ ggml_cuda_pool_alloc<char> src0_dd_alloc;
  ggml_cuda_pool_alloc<float> src1_ddf_alloc;
  ggml_cuda_pool_alloc<char> src1_ddq_alloc;
  ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
  int used_devices = 0;

  for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+ dev[id].cc = ggml_cuda_info().devices[id].cc;
+
  // by default, use all rows
  dev[id].row_low = 0;
  dev[id].row_high = ne01;
@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
  // for multi GPU, get the row boundaries from tensor split
  // and round to mul_mat_q tile sizes
  if (split) {
- const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);

  if (id != 0) {
  dev[id].row_low = ne01*tensor_split[id];
@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
  dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
  }

- if (convert_src1_to_q8_1) {
- dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+ if (quantize_src1) {
+ size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+ }
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);

  if (src1_on_device && src1_is_contiguous) {
- quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+ quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
  CUDA_CHECK(cudaGetLastError());
  }
  }
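The surrounding hunks replace the old convert_src1_to_q8_1 flag with a nullable quantize_cuda_t function pointer: sizing and copy decisions can now branch on which quantization routine was selected, and a null pointer means src1 is consumed as f32 without any quantization. The following is a minimal sketch of that strategy-by-function-pointer idea; all names and signatures below are hypothetical stand-ins, not the real ggml CUDA API.

#include <cstdio>

using quantize_fn = void (*)(const float * src, char * dst, int n);

static void quantize_plain(const float *, char *, int n) { std::printf("plain q8_1 quantize, n=%d\n", n); }
static void quantize_mmq  (const float *, char *, int n) { std::printf("mmq-layout quantize, n=%d\n", n); }

static void op_mul_mat(quantize_fn quantize_src1) {
    float src[4] = {0};
    char  dst[64];
    if (quantize_src1) {            // a null pointer means "use src1 as f32 directly"
        quantize_src1(src, dst, 4); // the chosen routine also implies the buffer layout
    } else {
        std::printf("no quantization\n");
    }
}

int main() {
    op_mul_mat(nullptr);         // analogous to the cuBLAS / dequantize paths
    op_mul_mat(quantize_plain);  // analogous to the mul_mat_vec_q path
    op_mul_mat(quantize_mmq);    // analogous to the mul_mat_q path
}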
@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
  const int64_t i03 = i0 / ne12;
  const int64_t i02 = i0 % ne12;

- const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+ } else {
+ src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ }

  // for split tensors the data begins at i0 == i0_offset_low
  char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
  // copy src0, src1 to device if necessary
  if (src1_is_contiguous) {
  if (id != ctx.device) {
- if (convert_src1_to_q8_1) {
+ if (quantize_src1) {
  char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
- CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
- src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+ const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+ const size_t height = src1_padded_col_size/(4*QK8_1);
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+ } else {
+ CUDA_CHECK(cudaMemcpyPeerAsync(
+ src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+ }
  } else {
  float * src1_ddf_i_source = (float *) src1->data;
  src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
  GGML_ASSERT(false);
  }

- if (convert_src1_to_q8_1 && !src1_is_contiguous) {
- quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+ if (quantize_src1 && !src1_is_contiguous) {
+ quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
  CUDA_CHECK(cudaGetLastError());
  }

@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
  dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
- #if !defined(GGML_USE_HIPBLAS)
- // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
- cudaMemcpy3DPeerParms p = {};
- p.dstDevice = ctx.device;
- p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
- p.srcDevice = id;
- p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
- p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
- CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
- #else
- // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
- dst_dd_i, row_diff*sizeof(float),
- row_diff*sizeof(float), src1_ncols,
- cudaMemcpyDeviceToDevice, stream));
- #endif
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+ dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
  } else {
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
  } else if (use_dequantize_mul_mat_vec) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
  } else if (use_mul_mat_vec_q) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
  } else if (use_mul_mat_q) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
  } else {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
  }
  }

@@ -2702,10 +2652,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

  if (cuda_graph_update_required) {
  // Extract nodes from graph
- if (cuda_ctx->cuda_graph->num_nodes == 0) {
- // First call with null argument gets number of nodes in graph
- CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
- }
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
  // Subsequent call with non-null argument gets nodes
  cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
  cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
@@ -2782,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_UNARY_OP_HARDSWISH:
  case GGML_UNARY_OP_GELU_QUICK:
  case GGML_UNARY_OP_TANH:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  return false;
  }
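Here, and in the Kompute hunk further down, the unary ops now require a contiguous src[0] instead of unconditionally returning true. The sketch below shows the kind of layout test such a check performs, assuming a simple 4-D extents/strides view; it is a stand-in, not the real ggml_is_contiguous, which also accounts for block-quantized types.

#include <cstddef>
#include <cstdio>

constexpr int MAX_DIMS = 4;

struct tensor_view {
    size_t elem_size;
    long   ne[MAX_DIMS];  // extents per dimension
    size_t nb[MAX_DIMS];  // strides per dimension, in bytes
};

// Contiguous means every stride equals the tightly packed stride:
// nb[d] == elem_size * ne[0] * ... * ne[d-1].
static bool is_contiguous(const tensor_view & t) {
    size_t expected = t.elem_size;
    for (int d = 0; d < MAX_DIMS; ++d) {
        if (t.nb[d] != expected) {
            return false;
        }
        expected *= (size_t) t.ne[d];
    }
    return true;
}

int main() {
    tensor_view packed = {4, {8, 2, 1, 1}, {4, 32, 64, 64}};
    tensor_view padded = {4, {8, 2, 1, 1}, {4, 48, 96, 96}};  // rows padded, not contiguous
    std::printf("%d %d\n", is_contiguous(packed), is_contiguous(padded));  // 1 0
}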
@@ -2905,10 +2853,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
  #else
- if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+ if (op->src[0]->ne[0] == 128) {
  return true;
  }
- return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+ if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
+ return true;
+ }
+ return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
+ op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  default:
  return false;
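For the non-HIP path this tightens the FLASH_ATTN_EXT check: head size 128 is always accepted, head size 64 additionally needs an f16 K tensor, and anything else needs a Volta-or-newer device plus f16 K and V. A condensed sketch of that predicate follows; the enum and the CC_VOLTA value are illustrative stand-ins for the real ggml types and constants.

#include <cstdio>

enum class dtype { f16, f32 };
constexpr int CC_VOLTA = 700;  // assumed value of the real CC_VOLTA constant

// head_size is op->src[0]->ne[0]; k_type/v_type are op->src[1]/op->src[2] types.
static bool supports_flash_attn_ext(int head_size, dtype k_type, dtype v_type, int cc) {
    if (head_size == 128) {
        return true;
    }
    if (head_size == 64 && k_type == dtype::f16) {
        return true;
    }
    return cc >= CC_VOLTA && k_type == dtype::f16 && v_type == dtype::f16;
}

int main() {
    std::printf("%d\n", supports_flash_attn_ext(128, dtype::f32, dtype::f32, 600)); // 1
    std::printf("%d\n", supports_flash_attn_ext(96,  dtype::f16, dtype::f16, 750)); // 1
    std::printf("%d\n", supports_flash_attn_ext(96,  dtype::f32, dtype::f32, 750)); // 0
}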
@@ -2917,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  GGML_UNUSED(backend);
  }

+ GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (ggml_backend_buft_is_cuda_split(buft)) {
+ return true;
+ }
+
+ if (ggml_backend_buft_is_cuda(buft)) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+ return buft_ctx->device == cuda_ctx->device;
+ }
+
+ return false;
+ }
+
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;

@@ -2989,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
  /* .synchronize = */ ggml_backend_cuda_synchronize,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
  /* .supports_op = */ ggml_backend_cuda_supports_op,
+ /* .supports_buft = */ ggml_backend_cuda_supports_buft,
  /* .offload_op = */ ggml_backend_cuda_offload_op,
  /* .event_new = */ ggml_backend_cuda_event_new,
  /* .event_free = */ ggml_backend_cuda_event_free,
data/vendor/tmp/llama.cpp/ggml-kompute.cpp
@@ -22,6 +22,7 @@
  #include "shaderop_mul_mat_q4_1.h"
  #include "shaderop_mul_mat_q6_k.h"
  #include "shaderop_mul_mat_mat_f32.h"
+ #include "shaderop_getrows_f32.h"
  #include "shaderop_getrows_f16.h"
  #include "shaderop_getrows_q4_0.h"
  #include "shaderop_getrows_q4_1.h"
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
  seq.record<kp::OpAlgoDispatch>(s_algo);
  }

+ template <typename... Args>
+ static void ggml_vk_get_rows_f32(Args&&... args) {
+ const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+ kp::shader_data::op_getrows_f32_comp_spv_len);
+
+ ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+ }
+
  template <typename... Args>
  static void ggml_vk_get_rows_f16(Args&&... args) {
  const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
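The new ggml_vk_get_rows_f32 follows the same shape as the existing f16 and quantized wrappers: a thin variadic function that binds the shader blob and element size, then perfectly forwards the remaining arguments to the shared ggml_vk_get_rows implementation. A self-contained sketch of that forwarding-wrapper pattern, with stand-in names instead of the real Kompute calls:

#include <cstddef>
#include <cstdio>
#include <utility>

// Shared implementation that all typed wrappers funnel into.
static void get_rows_impl(const char * type_name, size_t elem_size, int n_rows) {
    std::printf("get_rows<%s>: elem=%zu rows=%d\n", type_name, elem_size, n_rows);
}

// Typed wrapper: fixes the type-specific constants, forwards everything else.
template <typename... Args>
static void get_rows_f32(Args &&... args) {
    get_rows_impl("f32", sizeof(float), std::forward<Args>(args)...);
}

int main() {
    get_rows_f32(8);  // forwards n_rows = 8 to the shared implementation
}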
@@ -1183,7 +1192,7 @@ static void ggml_vk_rope(
  const std::shared_ptr<kp::Tensor>& inB,
  const std::shared_ptr<kp::Tensor>& out,
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
- ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx,
+ ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
  float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
  int32_t ne01, int32_t ne02, int32_t ne03,
  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
@@ -1212,14 +1221,14 @@ static void ggml_vk_rope(

  struct PushConstants {
  uint32_t inAOff, inBOff, outOff;
- int32_t n_dims, mode, n_orig_ctx;
+ int32_t n_dims, mode, n_ctx_orig;
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  uint32_t nb00, nb01, nb02, nb03;
  int32_t ne0;
  uint32_t nb0, nb1, nb2, nb3;
  } pushConsts {
  safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
- n_dims, mode, n_orig_ctx,
+ n_dims, mode, n_ctx_orig,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
  nb00, nb01, nb02, nb03,
  ne0,
@@ -1331,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
  case GGML_UNARY_OP_RELU:
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_SILU:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  ;
  }
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
  return op->ne[3] == 1;
  case GGML_OP_GET_ROWS:
  switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -1661,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  } break;
  case GGML_OP_GET_ROWS:
  {
- if (src0t == GGML_TYPE_F16) {
+ if (src0t == GGML_TYPE_F32) {
+ ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+ } else if (src0t == GGML_TYPE_F16) {
  ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
  } else if (src0t == GGML_TYPE_Q4_0) {
  ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
@@ -1680,13 +1692,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
  GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");

+ #pragma message("TODO: update rope NORM mode to match NEOX mode")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
+
  GGML_ASSERT(ne10 == ne02);
  GGML_ASSERT(src0t == dstt);
  // const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
@@ -1696,7 +1711,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
  ggml_vk_rope(
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx,
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
  ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
  );
@@ -1887,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
  return ctx->max_alloc;
  }

- static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- GGML_UNUSED(buft);
- return ggml_backend_is_kompute(backend);
- }
-
  static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
  /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
  /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };

@@ -1958,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
  return ggml_vk_supports_op(op);
  }

+ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ GGML_UNUSED(backend);
+ return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+ }
+
  static struct ggml_backend_i kompute_backend_i = {
  /* .get_name = */ ggml_backend_kompute_name,
  /* .free = */ ggml_backend_kompute_free,
@@ -1968,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
  /* .synchronize = */ NULL,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
  /* .supports_op = */ ggml_backend_kompute_supports_op,
+ /* .supports_buft = */ ggml_backend_kompute_supports_buft,
  /* .offload_op = */ NULL,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
data/vendor/tmp/llama.cpp/ggml-metal.h
@@ -1,7 +1,7 @@
  // An interface allowing to compute ggml_cgraph with Metal
  //
  // This is a fully functional interface that extends ggml with GPU support for Apple devices.
- // A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+ // A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
  //
  // How it works?
  //