llama_cpp 0.16.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu

@@ -148,6 +148,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -160,6 +162,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -172,6 +176,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -184,6 +190,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -196,6 +204,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -208,6 +218,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -220,6 +232,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -232,6 +246,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -244,6 +260,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -259,6 +277,8 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
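All ten hunks above add the same precondition. These kernels index src0->data as a flat array, which is only valid for a dense memory layout, and ggml_is_contiguous() is the check that guarantees it; the matching supports_op changes for the CUDA, Kompute, and Metal backends appear further down. A minimal sketch of the invariant being asserted, assuming ggml's convention of ne[] element counts and nb[] byte strides (an illustration, not the ggml source):

    #include <cstdint>
    #include <cstddef>

    // Illustrative 4-D tensor descriptor in the style of ggml's ne/nb fields.
    struct tensor_sketch {
        int64_t ne[4];     // elements per dimension
        size_t  nb[4];     // byte stride per dimension
        size_t  type_size; // bytes per element (F32 here, so 4)
    };

    // A tensor is contiguous when nb[0] is the element size and every higher
    // stride equals the previous stride times the previous dimension's count.
    static bool is_contiguous_sketch(const tensor_sketch & t) {
        if (t.nb[0] != t.type_size) {
            return false; // e.g. a strided view produced by permute
        }
        for (int i = 1; i < 4; ++i) {
            if (t.nb[i] != t.nb[i - 1] * (size_t) t.ne[i - 1]) {
                return false;
            }
        }
        return true;
    }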
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+        info.devices[id].nsm  = prop.multiProcessorCount;
+        info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        info.devices[id].smpb = prop.sharedMemPerBlock;
-        info.devices[id].nsm = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
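The device-info struct gains smpbo, the opt-in shared-memory-per-block limit. On NVIDIA GPUs sharedMemPerBlockOptin is usually larger than the default sharedMemPerBlock, but a kernel only gets the larger budget after explicitly opting in; on the HIP/AMD path there is no opt-in limit, so smpbo simply mirrors smpb. A hedged host-side sketch of the opt-in mechanism (the kernel and launch shape are placeholders, not from this diff):

    #include <cuda_runtime.h>

    // Placeholder kernel that uses dynamic shared memory.
    __global__ void my_kernel(float * data) {
        extern __shared__ float tile[];
        tile[threadIdx.x] = data[threadIdx.x];
        data[threadIdx.x] = tile[threadIdx.x];
    }

    // Request more dynamic shared memory than the default per-block limit,
    // up to the device's opt-in ceiling (what smpbo records above).
    static cudaError_t launch_with_optin_smem(float * data, size_t smem_bytes, cudaStream_t stream) {
        cudaError_t err = cudaFuncSetAttribute(
            (const void *) my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int) smem_bytes);
        if (err != cudaSuccess) {
            return err;
        }
        my_kernel<<<1, 256, smem_bytes, stream>>>(data);
        return cudaGetLastError();
    }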
@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }
 
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // cuda split buffer
 
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
-                max_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
         }
-    }
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
     }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return row_rounding;
 }
 
 static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
     const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+    const int64_t rounding = get_row_rounding(tensor_split);
 
     *row_low = id == 0 ? 0 : nrows*tensor_split[id];
     *row_low -= *row_low % rounding;
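The per-quantization-type rounding tables are gone: row boundaries for split tensors are now rounded to the largest MMQ tile height across the participating devices (get_mmq_y_host of each device's compute capability), independent of the tensor's type. An illustrative sketch of the rounding that get_row_split() applies, with made-up numbers (the real rounding comes from the MMQ tile sizes, not a hard-coded 128):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t nrows        = 4096;
        const float   tensor_split = 0.37f; // fraction of rows assigned before this device
        const int64_t rounding     = 128;   // assumed mmq_y for the device, illustrative

        int64_t row_low = (int64_t)(nrows * tensor_split); // 1515
        row_low -= row_low % rounding;                     // rounded down to 1408
        printf("row_low = %lld\n", (long long) row_low);
        return 0;
    }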
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 
@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
     GGML_UNUSED(main_device);
 }
 
+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+#if !defined(GGML_USE_HIPBLAS)
+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+    cudaMemcpy3DPeerParms p = {};
+    p.dstDevice = dstDevice;
+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+    p.srcDevice = srcDevice;
+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+    p.extent = make_cudaExtent(width, height, 1);
+    return cudaMemcpy3DPeerAsync(&p, stream);
+#else
+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+    GGML_UNUSED(dstDevice);
+    GGML_UNUSED(srcDevice);
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+#endif // !defined(GGML_USE_HIPBLAS)
+}
+
 static void ggml_cuda_op_mul_mat(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    const bool convert_src1_to_q8_1) {
+    quantize_cuda_t quantize_src1) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
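The new helper wraps a pattern that previously lived inline in the result-gather path (the @@ -1653,22 hunk further down): a pitched 2-D peer copy expressed as a degenerate cudaMemcpy3DPeerAsync, because plain cudaMemcpy2DAsync can fail between VMM pools on different devices. A hedged usage sketch with illustrative sizes; the helper itself is file-static in ggml-cuda.cu and assumed in scope:

    #include <cuda_runtime.h>

    // Gather a (rows x cols) float sub-block embedded in a wider row-major
    // allocation on src_device into a dense buffer on dst_device.
    static cudaError_t gather_block(float * dst, int dst_device, float * src, int src_device,
                                    size_t src_row_elems, size_t cols, size_t rows,
                                    cudaStream_t stream) {
        const size_t dpitch = cols          * sizeof(float); // dense rows at the destination
        const size_t spitch = src_row_elems * sizeof(float); // full row width at the source
        const size_t width  = cols          * sizeof(float); // bytes actually copied per row
        return ggml_cuda_Memcpy2DPeerAsync(dst, dst_device, dpitch,
                                           src, src_device, spitch, width, rows, stream);
    }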
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     struct dev_data {
-        ggml_cuda_pool_alloc<char> src0_dd_alloc;
+        int cc;
+
+        ggml_cuda_pool_alloc<char> src0_dd_alloc;
         ggml_cuda_pool_alloc<float> src1_ddf_alloc;
         ggml_cuda_pool_alloc<char> src1_ddq_alloc;
         ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
     int used_devices = 0;
 
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        dev[id].cc = ggml_cuda_info().devices[id].cc;
+
         // by default, use all rows
         dev[id].row_low = 0;
         dev[id].row_high = ne01;
@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
         // for multi GPU, get the row boundaries from tensor split
        // and round to mul_mat_q tile sizes
         if (split) {
-            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+            const int64_t rounding = get_row_rounding(tensor_split);
 
             if (id != 0) {
                 dev[id].row_low = ne01*tensor_split[id];
@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
 
-        if (convert_src1_to_q8_1) {
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+        if (quantize_src1) {
+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+            }
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
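When quantization is requested, the src1 staging buffer holds nrows1 rows of src1_padded_col_size elements in q8_1 form, scaled by the q8_1 type size over its block size. As an illustrative calculation, assuming QK8_1 = 32 and a 36-byte block_q8_1 (32 quantized bytes plus two fp16 scale fields; ggml-common.h has the authoritative layout), a padded row of 4096 elements takes 4096*36/32 = 4608 bytes. The MMQ path now over-allocates by one get_mmq_x_max_host tile of block_q8_1_mmq per device; judging from the allocation alone this looks like padding so the tiled kernel can read whole tiles past the final column, but the diff itself only shows the extra size.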
@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
             const int64_t i03 = i0 / ne12;
             const int64_t i02 = i0 % ne12;
 
-            const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+            size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+            } else {
+                src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+            }
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
             // copy src0, src1 to device if necessary
             if (src1_is_contiguous) {
                 if (id != ctx.device) {
-                    if (convert_src1_to_q8_1) {
+                    if (quantize_src1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
-                        CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
-                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                        if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                            const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+                            const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+                            const size_t height = src1_padded_col_size/(4*QK8_1);
+                            CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+                        } else {
+                            CUDA_CHECK(cudaMemcpyPeerAsync(
+                                src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                        }
                     } else {
                         float * src1_ddf_i_source = (float *) src1->data;
                         src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && !src1_is_contiguous) {
-                quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+            if (quantize_src1 && !src1_is_contiguous) {
+                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
 
@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                 dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-#if !defined(GGML_USE_HIPBLAS)
-                // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-                cudaMemcpy3DPeerParms p = {};
-                p.dstDevice = ctx.device;
-                p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                p.srcDevice = id;
-                p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-#else
-                // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
-                CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                    dst_dd_i, row_diff*sizeof(float),
-                    row_diff*sizeof(float), src1_ncols,
-                    cudaMemcpyDeviceToDevice, stream));
-#endif
+                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+                    dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
             } else {
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
     } else {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 }
 
@@ -2780,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             case GGML_UNARY_OP_HARDSWISH:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
         }
@@ -2919,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
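supports_buft inverts the old supports_backend hooks deleted above: instead of each buffer type knowing which backends accept it, the backend is asked whether it can consume tensors from a given buffer type, and the device-equality check moves here. Buffer types are recognized by comparing the get_name member of their vtable, which works because every buffer-type implementation supplies a distinct get_name function. A minimal sketch of that pointer-identity idiom:

    // Function-pointer identity as a cheap runtime type tag, as used by
    // ggml_backend_buft_is_cuda() and ggml_backend_buft_is_cuda_split().
    struct buft_iface { const char * (*get_name)(); };
    struct buft       { buft_iface iface; };

    static const char * cuda_name() { return "CUDA"; }
    static const char * cpu_name()  { return "CPU";  }

    static bool is_cuda(const buft * b) {
        return b->iface.get_name == cuda_name; // identity compare, no strcmp
    }

    int main() {
        buft cuda_buft = { { cuda_name } };
        buft cpu_buft  = { { cpu_name  } };
        return is_cuda(&cuda_buft) && !is_cuda(&cpu_buft) ? 0 : 1;
    }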
@@ -2991,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft = */ ggml_backend_cuda_supports_buft,
     /* .offload_op = */ ggml_backend_cuda_offload_op,
     /* .event_new = */ ggml_backend_cuda_event_new,
     /* .event_free = */ ggml_backend_cuda_event_free,

data/vendor/tmp/llama.cpp/ggml-kompute.cpp

@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
         case GGML_UNARY_OP_RELU:
         case GGML_UNARY_OP_GELU:
         case GGML_UNARY_OP_SILU:
-            return true;
+            return ggml_is_contiguous(op->src[0]);
         default:
             ;
     }
@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }
 
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }
 
+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name = */ ggml_backend_kompute_name,
     /* .free = */ ggml_backend_kompute_free,
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft = */ ggml_backend_kompute_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
data/vendor/tmp/llama.cpp/ggml-metal.m

@@ -744,7 +744,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_SILU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
         }
@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
                 // ne21 = n_rows
                 const int dst_rows = ne20*ne21;
                 const int dst_rows_min = n_as;
+                const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
 
                 // max size of the rowids array in the kernel shared buffer
-                GGML_ASSERT(dst_rows <= 2048);
+                GGML_ASSERT(dst_rows <= dst_rows_max);
 
                 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
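The fixed 2048-row cap on the MoE row-ids array becomes a limit derived from the device's threadgroup memory: maxThreadgroupMemoryLength minus 32 bytes of other shared state, minus 8192 bytes (presumably reserved for the shared matrix tile), at 4 bytes per row id. For a device reporting 32 KB of threadgroup memory, for example:

    dst_rows_max = (32768 - 32 - 8192) / 4 = 6136

which is comfortably above the old hard-coded 2048.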
@@ -3044,12 +3045,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
@@ -3064,7 +3059,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
         /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
@@ -3179,6 +3173,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
     return ggml_metal_supports_op(metal_ctx, op);
 }
 
+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(backend);
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name = */ ggml_backend_metal_name,
     /* .free = */ ggml_backend_metal_free,
@@ -3189,9 +3189,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_metal_graph_compute,
     /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .supports_buft = */ ggml_backend_metal_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,

data/vendor/tmp/llama.cpp/ggml-rpc.cpp

@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
     return ggml_nbytes(tensor);
 }
 
-GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_rpc(backend)) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
     /* .get_name = */ ggml_backend_rpc_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_rpc_get_max_size,
     /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -634,8 +624,17 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     UNUSED(backend);
     UNUSED(op);
-    GGML_ASSERT(false && "not implemented");
-    return false;
+    //TODO: call the remote backend and cache the results
+    return true;
+}
+
+GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    return buft_ctx->endpoint == rpc_ctx->endpoint;
 }
 
 static ggml_backend_i ggml_backend_rpc_interface = {
@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .synchronize = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_rpc_graph_compute,
     /* .supports_op = */ ggml_backend_rpc_supports_op,
+    /* .supports_buft = */ ggml_backend_rpc_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,