llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu

@@ -148,6 +148,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -160,6 +162,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -172,6 +176,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -184,6 +190,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -196,6 +204,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -208,6 +218,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -220,6 +232,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -232,6 +246,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -244,6 +260,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -259,6 +277,8 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  float * dst_d = (float *)dst->data;
  cudaStream_t stream = ctx.stream();

+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
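The ten hunks above each add GGML_ASSERT(ggml_is_contiguous(src0)) before launching an element-wise kernel, matching the tightened supports_op checks later in this diff. As a rough illustration of what such a check verifies, here is a minimal standalone C++ sketch; toy_tensor and toy_is_contiguous are hypothetical stand-ins, and the real ggml_is_contiguous in ggml.c additionally accounts for quantized block types.

    #include <cstddef>
    #include <cstdio>

    constexpr int MAX_DIMS = 4;

    struct toy_tensor {
        size_t ne[MAX_DIMS];  // elements per dimension
        size_t nb[MAX_DIMS];  // stride in bytes per dimension
        size_t type_size;     // size of one element in bytes
    };

    // Contiguous here means tightly packed, row-major: each stride equals the
    // previous stride times the previous dimension, starting from the element size.
    static bool toy_is_contiguous(const toy_tensor & t) {
        if (t.nb[0] != t.type_size) {
            return false;
        }
        for (int i = 1; i < MAX_DIMS; ++i) {
            if (t.nb[i] != t.nb[i - 1] * t.ne[i - 1]) {
                return false;
            }
        }
        return true;
    }

    int main() {
        toy_tensor dense = {{8, 4, 1, 1}, {4, 32, 128, 128}, 4};  // packed 8x4 f32 tensor
        toy_tensor view  = {{4, 4, 1, 1}, {4, 32, 128, 128}, 4};  // 4-column slice, same strides
        std::printf("dense: %d, view: %d\n", toy_is_contiguous(dense), toy_is_contiguous(view));
        return 0;  // prints "dense: 1, view: 0"
    }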
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
  info.default_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;

+ info.devices[id].nsm = prop.multiProcessorCount;
+ info.devices[id].smpb = prop.sharedMemPerBlock;
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ info.devices[id].smpbo = prop.sharedMemPerBlock;
  info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
  #else
+ info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- info.devices[id].smpb = prop.sharedMemPerBlock;
- info.devices[id].nsm = prop.multiProcessorCount;
  }

  for (int id = 0; id < info.device_count; ++id) {

@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
  return ctx->name.c_str();
  }

+ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+ }
+
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
  GGML_UNUSED(buft);
  }

- GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_cuda(backend)) {
- return false;
- }
-
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
- return buft_ctx->device == cuda_ctx->device;
- }
-
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
  /* .get_name = */ ggml_backend_cuda_buffer_type_name,
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };

@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {

  // cuda split buffer

- static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
- int64_t min_compute_capability = INT_MAX;
- int64_t max_compute_capability = INT_MIN;
+ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+ int64_t row_rounding = 0;
  for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
- if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
- min_compute_capability = ggml_cuda_info().devices[id].cc;
- }
- if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
- max_compute_capability = ggml_cuda_info().devices[id].cc;
- }
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+ continue;
  }
- }

- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- return max_compute_capability >= CC_RDNA2 ? 128 : 32;
- case GGML_TYPE_Q3_K:
- return min_compute_capability < CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- default:
- GGML_ASSERT(false);
+ const int cc = ggml_cuda_info().devices[id].cc;
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
  }
- #else
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q6_K:
- return 64;
- default:
- GGML_ASSERT(false);
- }
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return row_rounding;
  }

  static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
  const int64_t nrows = ggml_nrows(tensor);
- const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);

  *row_low = id == 0 ? 0 : nrows*tensor_split[id];
  *row_low -= *row_low % rounding;
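The get_row_rounding rewrite above drops the per-quantization-type switch tables and instead takes, across the devices that actually receive rows, the largest MMQ tile height for that device's compute capability. A hedged sketch of the same "max over participating devices" rule follows; tile_height_for_cc is a hypothetical stand-in for get_mmq_y_host(cc, get_mmq_x_max_host(cc)) and its return values are made up for illustration.

    #include <algorithm>
    #include <array>
    #include <cstdint>
    #include <cstdio>

    constexpr int MAX_DEVICES = 16;

    // Hypothetical stand-in for the MMQ tile height of a given compute capability.
    static int64_t tile_height_for_cc(int cc) {
        return cc >= 700 ? 128 : 64;
    }

    static int64_t row_rounding(const std::array<float, MAX_DEVICES> & tensor_split,
                                const std::array<int, MAX_DEVICES> & device_cc,
                                int device_count) {
        int64_t rounding = 0;
        for (int id = 0; id < device_count; ++id) {
            // A device participates only if its split fraction is below the next
            // one, i.e. it is assigned a non-empty slice of rows.
            const float next = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
            if (tensor_split[id] >= next) {
                continue;
            }
            rounding = std::max(rounding, tile_height_for_cc(device_cc[id]));
        }
        return rounding;
    }

    int main() {
        std::array<float, MAX_DEVICES> split{};  // device 0 starts at 0.0, device 1 at 0.5
        split[1] = 0.5f;
        std::array<int, MAX_DEVICES> cc{};
        cc[0] = 860;  // e.g. an Ampere card
        cc[1] = 610;  // e.g. a Pascal card
        std::printf("rounding = %lld\n", (long long) row_rounding(split, cc, 2));  // 128
        return 0;
    }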
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
  GGML_UNUSED(buft);
  }

+ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+ }
+
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
  // instead, we allocate them for each tensor separately in init_tensor

@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
  return total_size;
  }

- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cuda(backend);
-
- GGML_UNUSED(buft);
- }
-
  GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return false;

@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
  };

@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
  },
  /* .context = */ nullptr,

@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
  GGML_UNUSED(main_device);
  }

+ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+ void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+ #if !defined(GGML_USE_HIPBLAS)
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+ cudaMemcpy3DPeerParms p = {};
+ p.dstDevice = dstDevice;
+ p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+ p.srcDevice = srcDevice;
+ p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+ p.extent = make_cudaExtent(width, height, 1);
+ return cudaMemcpy3DPeerAsync(&p, stream);
+ #else
+ // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+ GGML_UNUSED(dstDevice);
+ GGML_UNUSED(srcDevice);
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+ #endif // !defined(GGML_USE_HIPBLAS)
+ }
+
  static void ggml_cuda_op_mul_mat(
  ggml_backend_cuda_context & ctx,
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
- const bool convert_src1_to_q8_1) {
+ quantize_cuda_t quantize_src1) {

  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
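The ggml_cuda_Memcpy2DPeerAsync helper added above expresses a strided device-to-device copy as a depth-1 cudaMemcpy3DPeerAsync on CUDA (working around the cudaMemcpy2DAsync issue with VMM pools noted in the comment) and falls back to cudaMemcpy2DAsync on HIP. The following standalone sketch shows the same 3-D peer-copy trick with the plain CUDA runtime API; it assumes at least two visible devices and uses illustrative sizes.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
        std::fprintf(stderr, "%s failed: %s\n", #call, cudaGetErrorString(e)); std::exit(1); } } while (0)

    int main() {
        int n = 0;
        CHECK(cudaGetDeviceCount(&n));
        if (n < 2) { std::puts("need two devices, skipping"); return 0; }

        const size_t width = 256, height = 64;  // bytes per row, number of rows
        void *src = nullptr, *dst = nullptr;

        CHECK(cudaSetDevice(0));
        CHECK(cudaMalloc(&src, width * height));
        CHECK(cudaSetDevice(1));
        CHECK(cudaMalloc(&dst, width * height));

        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        // Describe the 2-D copy as a 3-D peer copy with depth 1.
        cudaMemcpy3DPeerParms p = {};
        p.srcDevice = 0;
        p.srcPtr    = make_cudaPitchedPtr(src, width, width, height);
        p.dstDevice = 1;
        p.dstPtr    = make_cudaPitchedPtr(dst, width, width, height);
        p.extent    = make_cudaExtent(width, height, 1);
        CHECK(cudaMemcpy3DPeerAsync(&p, stream));
        CHECK(cudaStreamSynchronize(stream));

        std::puts("peer copy done");
        CHECK(cudaFree(dst));
        CHECK(cudaSetDevice(0));
        CHECK(cudaFree(src));
        return 0;
    }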
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
  }

  struct dev_data {
- ggml_cuda_pool_alloc<char> src0_dd_alloc;
+ int cc;
+
+ ggml_cuda_pool_alloc<char> src0_dd_alloc;
  ggml_cuda_pool_alloc<float> src1_ddf_alloc;
  ggml_cuda_pool_alloc<char> src1_ddq_alloc;
  ggml_cuda_pool_alloc<float> dst_dd_alloc;

@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
  int used_devices = 0;

  for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+ dev[id].cc = ggml_cuda_info().devices[id].cc;
+
  // by default, use all rows
  dev[id].row_low = 0;
  dev[id].row_high = ne01;

@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
  // for multi GPU, get the row boundaries from tensor split
  // and round to mul_mat_q tile sizes
  if (split) {
- const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);

  if (id != 0) {
  dev[id].row_low = ne01*tensor_split[id];

@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
  dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
  }

- if (convert_src1_to_q8_1) {
- dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+ if (quantize_src1) {
+ size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+ }
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);

  if (src1_on_device && src1_is_contiguous) {
- quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+ quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
  CUDA_CHECK(cudaGetLastError());
  }
  }

@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
  const int64_t i03 = i0 / ne12;
  const int64_t i02 = i0 % ne12;

- const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+ } else {
+ src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+ }

  // for split tensors the data begins at i0 == i0_offset_low
  char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;

@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
  // copy src0, src1 to device if necessary
  if (src1_is_contiguous) {
  if (id != ctx.device) {
- if (convert_src1_to_q8_1) {
+ if (quantize_src1) {
  char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
- CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
- src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+ if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+ const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+ const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+ const size_t height = src1_padded_col_size/(4*QK8_1);
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+ } else {
+ CUDA_CHECK(cudaMemcpyPeerAsync(
+ src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+ }
  } else {
  float * src1_ddf_i_source = (float *) src1->data;
  src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;

@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
  GGML_ASSERT(false);
  }

- if (convert_src1_to_q8_1 && !src1_is_contiguous) {
- quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+ if (quantize_src1 && !src1_is_contiguous) {
+ quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
  CUDA_CHECK(cudaGetLastError());
  }

@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
  dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
- #if !defined(GGML_USE_HIPBLAS)
- // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
- cudaMemcpy3DPeerParms p = {};
- p.dstDevice = ctx.device;
- p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
- p.srcDevice = id;
- p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
- p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
- CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
- #else
- // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
- dst_dd_i, row_diff*sizeof(float),
- row_diff*sizeof(float), src1_ncols,
- cudaMemcpyDeviceToDevice, stream));
- #endif
+ CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+ dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
  } else {
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
  } else if (use_dequantize_mul_mat_vec) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
  } else if (use_mul_mat_vec_q) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
  } else if (use_mul_mat_q) {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
  } else {
- ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+ ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
  }
  }
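In the dispatch above, the old convert_src1_to_q8_1 flag becomes a quantize_cuda_t function pointer: nullptr means "do not quantize", and the caller picks between quantize_row_q8_1_cuda and the new MMQ-specific quantize_mmq_q8_1_cuda. The pattern, reduced to a toy C++ example (names here are illustrative, not the real ggml-cuda types):

    #include <cstdio>
    #include <vector>

    // Toy stand-in for quantize_cuda_t: a routine that stages src into dst.
    using quantize_fn = void (*)(const float * src, std::vector<int> & dst);

    static void quantize_rowwise(const float * src, std::vector<int> & dst) {
        for (size_t i = 0; i < dst.size(); ++i) dst[i] = (int)(src[i] * 127.0f);
    }

    static void mul_mat_op(const float * src1, size_t n, quantize_fn quantize_src1) {
        std::vector<int> staged(n);
        if (quantize_src1) {              // was: if (convert_src1_to_q8_1)
            quantize_src1(src1, staged);  // was: hard-coded quantize_row_q8_1_cuda(...)
        }
        std::printf("staged in %s form\n", quantize_src1 ? "quantized" : "float");
    }

    int main() {
        const float x[4] = {0.1f, 0.2f, 0.3f, 0.4f};
        mul_mat_op(x, 4, nullptr);           // cuBLAS / dequantize paths
        mul_mat_op(x, 4, quantize_rowwise);  // vec_q / MMQ paths pass their own routine
        return 0;
    }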
@@ -2780,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_UNARY_OP_HARDSWISH:
  case GGML_UNARY_OP_GELU_QUICK:
  case GGML_UNARY_OP_TANH:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  return false;
  }

@@ -2919,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  GGML_UNUSED(backend);
  }

+ GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (ggml_backend_buft_is_cuda_split(buft)) {
+ return true;
+ }
+
+ if (ggml_backend_buft_is_cuda(buft)) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+ return buft_ctx->device == cuda_ctx->device;
+ }
+
+ return false;
+ }
+
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
  const int min_batch_size = 32;
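The new ggml_backend_cuda_supports_buft above takes over from the per-buffer-type supports_backend hooks removed earlier in this diff: the backend recognizes its own buffer types by comparing the buffer type's iface.get_name function pointer, then checks the device. A toy C++ sketch of that pointer-identity idiom (illustrative types only, not the real ggml-backend API):

    #include <cstdio>

    struct buffer_type_iface {
        const char * (*get_name)();
    };

    struct buffer_type {
        buffer_type_iface iface;
    };

    static const char * cuda_buft_name() { return "CUDA"; }
    static const char * cpu_buft_name()  { return "CPU";  }

    // Same idiom as ggml_backend_buft_is_cuda() in the diff: compare the
    // interface function pointer instead of string-comparing names.
    static bool backend_cuda_supports_buft(const buffer_type & buft) {
        return buft.iface.get_name == cuda_buft_name;
    }

    int main() {
        buffer_type cuda_buft = {{cuda_buft_name}};
        buffer_type cpu_buft  = {{cpu_buft_name}};
        std::printf("cuda buft: %d, cpu buft: %d\n",
                    backend_cuda_supports_buft(cuda_buft),
                    backend_cuda_supports_buft(cpu_buft));  // prints 1, 0
        return 0;
    }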
@@ -2991,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
  /* .synchronize = */ ggml_backend_cuda_synchronize,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
  /* .supports_op = */ ggml_backend_cuda_supports_op,
+ /* .supports_buft = */ ggml_backend_cuda_supports_buft,
  /* .offload_op = */ ggml_backend_cuda_offload_op,
  /* .event_new = */ ggml_backend_cuda_event_new,
  /* .event_free = */ ggml_backend_cuda_event_free,
data/vendor/tmp/llama.cpp/ggml-kompute.cpp

@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
  case GGML_UNARY_OP_RELU:
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_SILU:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  ;
  }

@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
  return ctx->max_alloc;
  }

- static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- GGML_UNUSED(buft);
- return ggml_backend_is_kompute(backend);
- }
-
  static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
  /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
  /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };

@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
  return ggml_vk_supports_op(op);
  }

+ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ GGML_UNUSED(backend);
+ return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+ }
+
  static struct ggml_backend_i kompute_backend_i = {
  /* .get_name = */ ggml_backend_kompute_name,
  /* .free = */ ggml_backend_kompute_free,

@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
  /* .synchronize = */ NULL,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
  /* .supports_op = */ ggml_backend_kompute_supports_op,
+ /* .supports_buft = */ ggml_backend_kompute_supports_buft,
  /* .offload_op = */ NULL,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
data/vendor/tmp/llama.cpp/ggml-metal.m

@@ -744,7 +744,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
  case GGML_UNARY_OP_GELU:
  case GGML_UNARY_OP_GELU_QUICK:
  case GGML_UNARY_OP_SILU:
- return true;
+ return ggml_is_contiguous(op->src[0]);
  default:
  return false;
  }

@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
  // ne21 = n_rows
  const int dst_rows = ne20*ne21;
  const int dst_rows_min = n_as;
+ const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;

  // max size of the rowids array in the kernel shared buffer
- GGML_ASSERT(dst_rows <= 2048);
+ GGML_ASSERT(dst_rows <= dst_rows_max);

  // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
  // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
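The hunk above replaces the fixed dst_rows <= 2048 limit with a bound derived from the device's threadgroup memory: the formula apparently keeps 32 + 8192 bytes aside for other shared data and budgets 4 bytes per row id. As a worked example, on a device reporting 32 KiB of threadgroup memory (a plausible value; the real code reads maxThreadgroupMemoryLength at run time) the bound works out to 6136 rows:

    #include <cstdio>

    int main() {
        const int max_threadgroup_mem = 32 * 1024;  // bytes, assumed for this example
        const int reserved            = 32 + 8192;  // bytes kept for other shared data
        const int dst_rows_max        = (max_threadgroup_mem - reserved) / 4;  // 4 bytes per row id
        std::printf("dst_rows_max = %d (previously hard-coded to 2048)\n", dst_rows_max);  // 6136
        return 0;
    }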
@@ -3044,12 +3045,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
  UNUSED(buft);
  }

- GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
- UNUSED(buft);
- }
-
  GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return true;

@@ -3064,7 +3059,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
  /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
  },
  /* .context = */ NULL,

@@ -3179,6 +3173,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
  return ggml_metal_supports_op(metal_ctx, op);
  }

+ GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+ UNUSED(backend);
+ }
+
  static struct ggml_backend_i ggml_backend_metal_i = {
  /* .get_name = */ ggml_backend_metal_name,
  /* .free = */ ggml_backend_metal_free,

@@ -3189,9 +3189,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
  /* .synchronize = */ NULL,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
  /* .supports_op = */ ggml_backend_metal_supports_op,
+ /* .supports_buft = */ ggml_backend_metal_supports_buft,
  /* .offload_op = */ NULL,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
data/vendor/tmp/llama.cpp/ggml-rpc.cpp

@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
  return ggml_nbytes(tensor);
  }

- GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_rpc(backend)) {
- return false;
- }
- ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
- ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
- return buft_ctx->endpoint == rpc_ctx->endpoint;
- }
-
  static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
  /* .get_name = */ ggml_backend_rpc_buffer_type_name,
  /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
  /* .get_max_size = */ ggml_backend_rpc_get_max_size,
  /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
  /* .is_host = */ NULL,
  };

@@ -634,8 +624,17 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
  GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
  UNUSED(backend);
  UNUSED(op);
- GGML_ASSERT(false && "not implemented");
- return false;
+ //TODO: call the remote backend and cache the results
+ return true;
+ }
+
+ GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+ return false;
+ }
+ ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+ ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+ return buft_ctx->endpoint == rpc_ctx->endpoint;
  }

  static ggml_backend_i ggml_backend_rpc_interface = {

@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
  /* .synchronize = */ ggml_backend_rpc_synchronize,
  /* .graph_plan_create = */ NULL,
  /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_rpc_graph_compute,
  /* .supports_op = */ ggml_backend_rpc_supports_op,
+ /* .supports_buft = */ ggml_backend_rpc_supports_buft,
  /* .offload_op = */ NULL,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,