llama_cpp 0.16.0 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp CHANGED
@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
-
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -8,13 +8,15 @@

 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>

 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -56,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
     } \
 } while (0)

+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
 struct ggml_backend_vk_context;

 struct vk_queue {
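Annotation (not part of the gem's diff): the hunk above introduces the VK_LOG_DEBUG macro that the later hunks use to collapse the repeated three-line #ifdef GGML_VULKAN_DEBUG blocks. A minimal standalone C++ sketch of the same pattern, with a hypothetical call-site value:

    #include <cstddef>
    #include <iostream>

    // With GGML_VULKAN_DEBUG defined, the macro expands to a std::cerr statement;
    // otherwise it expands to a no-op and the streamed arguments vanish entirely.
    #ifdef GGML_VULKAN_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)
    #endif

    int main() {
        size_t idx = 0; // hypothetical value, for illustration only
        (void) idx;     // silences the unused warning when logging is compiled out
        // Call sites can chain << operands, which is how the hunks below rewrite
        // the old std::cerr blocks:
        VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
        return 0;
    }

Compile with -DGGML_VULKAN_DEBUG to enable the output; without the flag the call compiles away.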
@@ -150,7 +158,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline …
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -158,9 +166,7 @@ struct vk_device {
     std::vector<vk_pipeline_ref> pipelines;

     ~vk_device() {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "destroy device " << name << std::endl;
-#endif
+        VK_LOG_DEBUG("destroy device " << name);
         device.destroyCommandPool(compute_queue.pool);
         if (!single_queue) {
             device.destroyCommandPool(transfer_queue.pool);
@@ -195,9 +201,7 @@ struct vk_buffer_struct {
         if (size == 0) {
             return;
         }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

         device->device.freeMemory(device_memory);
         device->device.destroyBuffer(buffer);
@@ -283,26 +287,15 @@ struct vk_op_diag_mask_push_constants {

 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[…
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };

 struct vk_op_soft_max_push_constants {
@@ -345,15 +338,12 @@ struct vk_context {
 };

 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;

     vk_buffer_ref buffer_gpu;
     uint64_t offset;

     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -368,6 +358,49 @@ struct ggml_vk_garbage_collector {
     std::vector<vk_context> contexts;
 };

+#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+#include <mutex>
+
+#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+static std::string format_size(size_t size) {
+    const size_t kib = 1024;
+    const size_t mib = kib * 1024;
+    const size_t gib = mib * 1024;
+
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(2);
+
+    if (size >= gib) {
+        oss << static_cast<double>(size) / gib << " GiB";
+    } else if (size >= mib) {
+        oss << static_cast<double>(size) / mib << " MiB";
+    } else if (size >= kib) {
+        oss << static_cast<double>(size) / kib << " KiB";
+    } else {
+        oss << size << " B";
+    }
+
+    return oss.str();
+}
+
+static std::mutex log_mutex;
+
+class vk_memory_logger {
+public:
+    vk_memory_logger(): total_device(0), total_host(0) {}
+    void log_allocation(vk_buffer_ref buf_ref, size_t size);
+    void log_deallocation(vk_buffer_ref buf_ref);
+
+private:
+    std::map<vk::Buffer, size_t> allocations; // Track allocations
+    size_t total_device;
+    size_t total_host;
+};
+#else
+#define VK_LOG_MEMORY(msg) ((void) 0)
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct ggml_backend_vk_context {
     std::string name;

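Annotation (not part of the gem's diff): the format_size helper added above prints byte counts in binary units (1 KiB = 1024 B) with two decimal places. A self-contained copy of the same logic for experimenting, with illustrative expected outputs in the comments:

    #include <cstddef>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Same logic as the format_size added in the hunk above: choose the largest
    // binary unit that fits and print the value with two decimal places.
    static std::string format_size(size_t size) {
        const size_t kib = 1024;
        const size_t mib = kib * 1024;
        const size_t gib = mib * 1024;

        std::ostringstream oss;
        oss << std::fixed << std::setprecision(2);

        if (size >= gib) {
            oss << static_cast<double>(size) / gib << " GiB";
        } else if (size >= mib) {
            oss << static_cast<double>(size) / mib << " MiB";
        } else if (size >= kib) {
            oss << static_cast<double>(size) / kib << " KiB";
        } else {
            oss << size << " B";
        }

        return oss.str();
    }

    int main() {
        std::cout << format_size(512) << "\n";        // 512 B
        std::cout << format_size(1536) << "\n";       // 1.50 KiB
        std::cout << format_size(268435456) << "\n";  // 256.00 MiB
        return 0;
    }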
@@ -392,8 +425,45 @@ struct ggml_backend_vk_context {
     bool initialized;

     size_t idx;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    vk_memory_logger memory_logger;
+#endif
 };

+#ifdef GGML_VULKAN_MEMORY_DEBUG
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    total_device -= device ? it->second : 0;
+    total_host -= device ? 0 : it->second;
+    if (it != allocations.end()) {
+        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct vk_instance_t {
     vk::Instance instance;

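Annotation (not part of the gem's diff): the two hunks above add an opt-in memory logger — every allocation is recorded in a std::map keyed by the vk::Buffer handle under a global mutex, so running device/host totals can be reported and mismatched frees flagged. A simplified, self-contained sketch of the same bookkeeping idea; the class name, the plain int handles standing in for vk::Buffer, and the demo in main are all hypothetical:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <mutex>

    // Simplified tracker in the spirit of vk_memory_logger: a handle -> size map
    // plus a running total, guarded by a mutex so logging is thread-safe.
    static std::mutex log_mutex;

    class memory_logger {
    public:
        void log_allocation(int handle, size_t size) {
            std::lock_guard<std::mutex> guard(log_mutex);
            allocations[handle] = size;
            total += size;
            std::printf("+%zu B at %d, total %zu B\n", size, handle, total);
        }

        void log_deallocation(int handle) {
            std::lock_guard<std::mutex> guard(log_mutex);
            auto it = allocations.find(handle);
            if (it != allocations.end()) { // check before touching it->second
                total -= it->second;
                std::printf("-%zu B at %d, total %zu B\n", it->second, handle, total);
                allocations.erase(it);
            } else {
                std::printf("ERROR: deallocation of unknown handle %d\n", handle);
            }
        }

    private:
        std::map<int, size_t> allocations;
        size_t total = 0;
    };

    int main() {
        memory_logger logger;
        logger.log_allocation(1, 4096);
        logger.log_deallocation(1);
        logger.log_deallocation(2); // reported as unknown
        return 0;
    }

Note that the sketch checks the iterator before dereferencing it; in the hunk above, log_deallocation reads it->second to adjust the totals before its it != allocations.end() guard, so an unknown buffer handle would dereference the end iterator.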
@@ -406,15 +476,11 @@ struct vk_instance_t {
 };

 static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
     static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

     if (devices[idx].expired()) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "Initializing new vk_device" << std::endl;
-#endif
+        VK_LOG_DEBUG("Initializing new vk_device");
         std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
         device->initialized = false;
         devices[idx] = device;
@@ -441,9 +507,7 @@ static vk_instance_t vk_instance;
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

 static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

@@ -544,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
 }

 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
     for (auto& pool : pipeline->descriptor_pools) {
         device.destroyDescriptorPool(pool);
     }
@@ -564,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 }

 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
     if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
         // Enough descriptors are available
         return;
@@ -596,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
 }

 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
     pipeline->descriptor_set_idx = 0;
 }

 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -625,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
 }

 static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_submission()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_submission()");
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
     s.wait_semaphores = std::move(wait_semaphores);
@@ -636,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
 }

 static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
     if (ctx->seqs.empty()) {
         return;
     }
@@ -712,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
 }

 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
     const uint32_t qfsize = queue_family_props.size();

     // Try with avoid preferences first
@@ -760,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }

 static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_queue()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_queue()");
     q.queue_family_index = queue_family_index;

     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -776,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
 }

 static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_context()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_context()");
     ctx->gc.contexts.emplace_back();
     vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
     memset((void *) result, 0, sizeof(vk_context));
@@ -788,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
 }

 static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
@@ -800,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
 }

 static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
@@ -821,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
 }

 static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
     // Requires command buffers to be done

     ctx->device->device.resetCommandPool(q.pool);
@@ -843,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 }

 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
     vk_buffer buf = std::make_shared<vk_buffer_struct>();

     if (size == 0) {
@@ -905,8 +943,8 @@

     buf->device = ctx->device;

-#ifdef …
-…
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    ctx->memory_logger.log_allocation(buf, size);
 #endif

     return buf;
@@ -941,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
 }

 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+    if (buf == nullptr) {
+        return;
+    }
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    buf->ctx->memory_logger.log_deallocation(buf);
+#endif
+
     buf.reset();
 }

@@ -949,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 }

 static void ggml_vk_sync_buffers(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_sync_buffers()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

     ctx->s->buffer.pipelineBarrier(
@@ -965,9 +1009,7 @@
 }

 static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_wait_events()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
         return;
     }
@@ -1002,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
 }

 static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

     const std::shared_ptr<vk_device> device = ctx->device;

@@ -1055,12 +1095,12 @@
     ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

     if (device->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", …
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1153,12 +1193,12 @@
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", …
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1244,12 +1284,12 @@
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", …
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1342,12 +1382,12 @@
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", …
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", …
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1442,11 +1482,11 @@
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1495,11 @@
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1468,11 +1508,11 @@
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

     // dequant shaders
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1481,11 +1521,11 @@
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

     // get_rows
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1537,11 +1577,11 @@
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(ctx, ctx->device->…
-    ggml_vk_create_pipeline(ctx, ctx->device->…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(…
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(…
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);

@@ -1551,9 +1591,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 static void ggml_vk_print_gpu_info(size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
     GGML_ASSERT(vk_instance.initialized);
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
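This hunk shows the pattern that repeats through the rest of the file: a three-line #ifdef GGML_VULKAN_DEBUG block around a std::cerr statement collapses into a single VK_LOG_DEBUG (or, for allocation paths, VK_LOG_MEMORY) call. The macro definitions sit earlier in ggml-vulkan.cpp, outside this excerpt; what follows is a minimal sketch of the shape such a macro needs for these call sites to compile, assuming the same guard as the removed code, not the verbatim upstream definition.

    // Minimal sketch: the argument is a stream expression ("a" << b << "c"),
    // so the macro must expand into a std::cerr chain when the guard is set
    // and into nothing otherwise.
    #include <iostream>

    #if defined(GGML_VULKAN_DEBUG)
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)
    #endif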
@@ -1569,8 +1607,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
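The two added lines extend a Vulkan pNext chain: every extension struct linked from props2 is filled by the single getProperties2 call, so fetching the driver name costs only one more link in the chain. Reassembled from the hunk above into a self-contained helper (the function wrapper is ours, for illustration):

    #include <vulkan/vulkan.hpp>

    // Post-patch pNext chain: one getProperties2() call populates all four
    // structures, including the newly chained driver properties.
    static vk::PhysicalDeviceDriverProperties query_driver_props(vk::PhysicalDevice physical_device) {
        vk::PhysicalDeviceProperties2            props2;
        vk::PhysicalDeviceMaintenance3Properties props3;
        vk::PhysicalDeviceSubgroupProperties     subgroup_props;
        vk::PhysicalDeviceDriverProperties       driver_props;  // new in this diff
        props2.pNext         = &props3;
        props3.pNext         = &subgroup_props;
        subgroup_props.pNext = &driver_props;                   // new in this diff
        physical_device.getProperties2(&props2);
        return driver_props; // driverName and driverID are now valid
    }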
@@ -1614,7 +1654,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1628,9 +1668,7 @@ void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1707,10 +1745,80 @@ void ggml_vk_instance_init() {
 
         // Default to using all dedicated GPUs
         for (size_t i = 0; i < devices.size(); i++) {
-            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+            vk::PhysicalDeviceProperties2 new_props;
+            vk::PhysicalDeviceDriverProperties new_driver;
+            vk::PhysicalDeviceIDProperties new_id;
+            new_props.pNext = &new_driver;
+            new_driver.pNext = &new_id;
+            devices[i].getProperties2(&new_props);
+
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                // Check if there are two physical devices corresponding to the same GPU
+                auto old_device = std::find_if(
+                    vk_instance.device_indices.begin(),
+                    vk_instance.device_indices.end(),
+                    [&devices, &new_id](const size_t k){
+                        vk::PhysicalDeviceProperties2 old_props;
+                        vk::PhysicalDeviceIDProperties old_id;
+                        old_props.pNext = &old_id;
+                        devices[k].getProperties2(&old_props);
+                        return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    }
+                );
+                if (old_device == vk_instance.device_indices.end()) {
+                    vk_instance.device_indices.push_back(i);
+                } else {
+                    // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                    // This can cause error when splitting layers aross the devices, need to keep only 1
+                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
+
+                    vk::PhysicalDeviceProperties2 old_props;
+                    vk::PhysicalDeviceDriverProperties old_driver;
+                    old_props.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_props);
+
+                    std::map<vk::DriverId, int> driver_priorities {};
+                    int old_priority = std::numeric_limits<int>::max();
+                    int new_priority = std::numeric_limits<int>::max();
+
+                    // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                    // Smaller number -> higher priority
+                    switch (old_props.properties.vendorID) {
+                        case VK_VENDOR_ID_AMD:
+                            driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                            driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                            driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                            break;
+                        case VK_VENDOR_ID_INTEL:
+                            driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                            driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                            break;
+                        case VK_VENDOR_ID_NVIDIA:
+                            driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                            driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                            break;
+                    }
+
+                    if (driver_priorities.count(old_driver.driverID)) {
+                        old_priority = driver_priorities[old_driver.driverID];
+                    }
+                    if (driver_priorities.count(new_driver.driverID)) {
+                        new_priority = driver_priorities[new_driver.driverID];
+                    }
+
+                    if (new_priority < old_priority) {
+                        auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                        vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                        vk_instance.device_indices.push_back(i);
 
-            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-                vk_instance.device_indices.push_back(i);
+                        VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
+                    }
+                    else {
+                        VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
+                    }
+                }
+            }
         }
     }
 
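The hunk above deduplicates GPUs that the Vulkan loader exposes twice because two drivers cover the same hardware: physical devices are matched by deviceUUID, and a per-vendor table decides which driver to keep. The ranking rule, pulled out into a standalone helper (the helper name and shape are ours; upstream builds the std::map inline):

    #include <limits>
    #include <vulkan/vulkan.hpp>

    // PCI vendor ids; ggml-vulkan defines equivalent constants itself.
    #define VK_VENDOR_ID_AMD    0x1002
    #define VK_VENDOR_ID_INTEL  0x8086
    #define VK_VENDOR_ID_NVIDIA 0x10de

    // Condensed sketch of the selection rule: a smaller rank wins, and an
    // unknown driver gets the worst possible rank, mirroring the inline
    // std::map<vk::DriverId, int> logic in the diff.
    static int driver_rank(uint32_t vendor_id, vk::DriverId id) {
        switch (vendor_id) {
            case VK_VENDOR_ID_AMD:
                if (id == vk::DriverId::eMesaRadv)       return 1;
                if (id == vk::DriverId::eAmdOpenSource)  return 2;
                if (id == vk::DriverId::eAmdProprietary) return 3;
                break;
            case VK_VENDOR_ID_INTEL:
                if (id == vk::DriverId::eIntelOpenSourceMESA)     return 1;
                if (id == vk::DriverId::eIntelProprietaryWindows) return 2;
                break;
            case VK_VENDOR_ID_NVIDIA:
                if (id == vk::DriverId::eNvidiaProprietary) return 1;
                break;
        }
        return std::numeric_limits<int>::max();
    }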
@@ -1732,9 +1840,7 @@ void ggml_vk_instance_init() {
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1907,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_Q4_0:
@@ -1931,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -1969,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
     switch (a_type) {
@@ -1996,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
@@ -2031,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
    GGML_ASSERT(b_type == GGML_TYPE_F32);
 
    switch (a_type) {
@@ -2058,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
     int best_i = -1;
     size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
     int worst_i = -1;
@@ -2088,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
         ggml_vk_destroy_buffer(b);
     }
 
-    return
+    return ggml_vk_create_buffer_device(ctx, size);
 }
 
 static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
     for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
         vk_buffer& b = ctx->buffer_pool[i];
         if (b == nullptr) {
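ggml_vk_pool_malloc, partially visible above, keeps a small pool and reuses the smallest free buffer that is large enough; only when nothing fits does it destroy a pooled buffer and allocate a fresh device buffer. The best-fit scan in isolation (simplified stand-in types; the real pool is a fixed-size array of vk_buffer in the backend context):

    #include <cstddef>
    #include <limits>
    #include <vector>

    struct PoolBuf { size_t size; bool free_; };

    // Returns the index of the smallest free buffer that fits, or -1 when the
    // caller has to allocate a new one (mirroring the best_i/best_size scan).
    static int best_fit(const std::vector<PoolBuf>& pool, size_t size) {
        int    best_i    = -1;
        size_t best_size = std::numeric_limits<size_t>::max();
        for (int i = 0; i < (int) pool.size(); ++i) {
            if (pool[i].free_ && pool[i].size >= size && pool[i].size < best_size) {
                best_i    = i;
                best_size = pool[i].size;
            }
        }
        return best_i;
    }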
@@ -2115,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
         }
     }
 
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
     // Otherwise create new buffer
     vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
     ctx->gc.temp_buffers.push_back(buf);
@@ -2123,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
 }
 
 static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
     vk_buffer buf = ggml_vk_create_buffer(ctx, size,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2147,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
     if (ptr == nullptr) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
     vk_buffer buf;
     size_t index;
     for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2201,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : buffers) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"
-#endif
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
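The three CEIL_DIV lines at the top of ggml_vk_dispatch_pipeline convert element counts into workgroup counts: each pipeline records how many elements one workgroup covers per dimension (its wg_denoms), and the dispatch rounds up so the grid always covers the whole tensor. The arithmetic on its own:

    #include <cassert>
    #include <cstdint>

    // Round-up integer division, the same computation CEIL_DIV performs above.
    static uint32_t ceil_div(uint32_t x, uint32_t d) {
        return (x + d - 1) / d;
    }

    int main() {
        assert(ceil_div(1000, 256) == 4); // 4 workgroups cover 1024 >= 1000 elements
        assert(ceil_div(1024, 256) == 4); // an exact fit adds no extra workgroup
        return 0;
    }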
@@ -2240,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
 }
 
 static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
@@ -2252,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
 }
 
 static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
     }
@@ -2264,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
 }
 
 static size_t ggml_vk_align_size(size_t width, size_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
     return CEIL_DIV(width, align) * align;
 }
 
@@ -2280,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
+        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
         ggml_vk_destroy_buffer(ctx->sync_staging);
         ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
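ggml_vk_ensure_sync_staging_buffer grows a single host-visible staging buffer on demand and never shrinks it, which is why the added VK_LOG_MEMORY line fires only on actual reallocations. The pattern, with a plain byte vector standing in for the Vulkan allocation:

    #include <cstddef>
    #include <vector>

    // Grow-only staging cache: reallocate only when a larger transfer shows up
    // (the real code destroys and re-creates a host-visible vk_buffer instead).
    static void ensure_staging(std::vector<char>& staging, size_t size) {
        if (staging.size() < size) {
            staging.assign(size, 0); // the point where VK_LOG_MEMORY fires
        }
    }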
@@ -2288,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
 }
 
 static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
     GGML_ASSERT(!ggml_is_contiguous(tensor));
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2395,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_cont
 }
 
 static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2432,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_cont
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Staging buffer required
     vk_buffer staging = ctx->staging;
@@ -2469,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_cont
 }
 
 static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
     return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
 }
 
 static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2503,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
 }
 
 static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
     ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
 }
 
 static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
     GGML_ASSERT(width > 0);
     GGML_ASSERT(height > 0);
     GGML_ASSERT(src != nullptr);
@@ -2546,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_conte
 
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Fall back to staging buffer
     vk_buffer staging = ctx->staging;
@@ -2575,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context
 }
 
 static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
@@ -2599,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 }
 
 static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
     // Make sure both buffers are on same ctx
     GGML_ASSERT(src->ctx == dst->ctx);
 
@@ -2612,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     if (src->ctx == dst->ctx) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
         // Copy within the device
         ggml_backend_vk_context * ctx = src->ctx;
 
@@ -2626,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
     } else {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
         // Copy device to device
         ggml_backend_vk_context * src_ctx = src->ctx;
         ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2646,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 }
 
 static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2663,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
 }
 
 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -2693,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
     const uint64_t ne0 = dst->ne[0];
     const uint64_t ne1 = dst->ne[1];
     const uint64_t ne2 = dst->ne[2];
@@ -2719,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
     // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
     //     return 4;
     // }
@@ -2753,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
 }
 
 static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
     switch (ctx->device->vendor_id) {
         case VK_VENDOR_ID_AMD:
             return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2777,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 }
 
 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
     return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
 }
 
@@ -2789,9 +2838,7 @@ static void ggml_vk_matmul(
     uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
     uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
     uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
         const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2815,12 +2862,10 @@ static void ggml_vk_matmul_id(
     uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
     uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
     uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
+    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
         "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
         "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
-        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"
-#endif
+        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
     ggml_vk_sync_buffers(subctx);
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                               nei0, nei1, nbi1, ne11 };
@@ -2850,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 }
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "
-    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
@@ -2870,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
|
|
2949
2990
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
2950
2991
|
|
2951
2992
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
2952
|
-
const uint64_t d_buf_offset = extra->offset;
|
2993
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
2953
2994
|
GGML_ASSERT(d_D != nullptr);
|
2954
2995
|
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
2955
2996
|
vk_buffer d_X;
|
@@ -2958,12 +2999,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
2958
2999
|
uint64_t y_buf_offset = 0;
|
2959
3000
|
if (!src0_uma) {
|
2960
3001
|
d_Qx = extra_src0->buffer_gpu.lock();
|
2961
|
-
qx_buf_offset = extra_src0->offset;
|
3002
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
2962
3003
|
GGML_ASSERT(d_Qx != nullptr);
|
2963
3004
|
}
|
2964
3005
|
if (!src1_uma) {
|
2965
3006
|
d_Qy = extra_src1->buffer_gpu.lock();
|
2966
|
-
qy_buf_offset = extra_src1->offset;
|
3007
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
2967
3008
|
GGML_ASSERT(d_Qy != nullptr);
|
2968
3009
|
}
|
2969
3010
|
if (qx_needs_dequant) {
|
@@ -3045,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
3045
3086
|
}
|
3046
3087
|
|
3047
3088
|
static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3048
|
-
|
3049
|
-
std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3089
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3050
3090
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3051
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
3052
|
-
#endif
|
3091
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
3053
3092
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
3054
3093
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
3055
3094
|
|
@@ -3114,7 +3153,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
3114
3153
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
3115
3154
|
|
3116
3155
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3117
|
-
const uint64_t d_buf_offset = extra->offset;
|
3156
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
3118
3157
|
GGML_ASSERT(d_D != nullptr);
|
3119
3158
|
vk_buffer d_X;
|
3120
3159
|
uint64_t x_buf_offset = 0;
|
@@ -3122,12 +3161,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
3122
3161
|
uint64_t y_buf_offset = 0;
|
3123
3162
|
if(!src0_uma) {
|
3124
3163
|
d_Qx = extra_src0->buffer_gpu.lock();
|
3125
|
-
qx_buf_offset = extra_src0->offset;
|
3164
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
3126
3165
|
GGML_ASSERT(d_Qx != nullptr);
|
3127
3166
|
}
|
3128
3167
|
if(!src1_uma) {
|
3129
3168
|
d_Qy = extra_src1->buffer_gpu.lock();
|
3130
|
-
qy_buf_offset = extra_src1->offset;
|
3169
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
3131
3170
|
GGML_ASSERT(d_Qy != nullptr);
|
3132
3171
|
}
|
3133
3172
|
if (qx_needs_dequant) {
|
@@ -3200,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
3200
3239
|
}
|
3201
3240
|
|
3202
3241
|
static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3203
|
-
|
3204
|
-
std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3242
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3205
3243
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3206
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
3207
|
-
#endif
|
3244
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
3208
3245
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
3209
3246
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
|
3210
3247
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
|
@@ -3246,14 +3283,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
3246
3283
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
3247
3284
|
|
3248
3285
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3249
|
-
const uint64_t d_buf_offset = extra->offset;
|
3286
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
3250
3287
|
GGML_ASSERT(d_D != nullptr);
|
3251
3288
|
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
3252
|
-
const uint64_t qx_buf_offset = extra_src0->offset;
|
3289
|
+
const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
|
3253
3290
|
GGML_ASSERT(d_Qx != nullptr);
|
3254
3291
|
if (!src1_uma) {
|
3255
3292
|
d_Qy = extra_src1->buffer_gpu.lock();
|
3256
|
-
qy_buf_offset = extra_src1->offset;
|
3293
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
3257
3294
|
GGML_ASSERT(d_Qx != nullptr);
|
3258
3295
|
}
|
3259
3296
|
|
@@ -3273,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
3273
3310
|
}
|
3274
3311
|
|
3275
3312
|
static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3276
|
-
|
3277
|
-
std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3313
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3278
3314
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3279
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
3280
|
-
#endif
|
3315
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
3281
3316
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
3282
3317
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
3283
3318
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
@@ -3323,14 +3358,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
3323
3358
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
3324
3359
|
|
3325
3360
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3326
|
-
const uint64_t d_buf_offset = extra->offset;
|
3361
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
3327
3362
|
GGML_ASSERT(d_D != nullptr);
|
3328
3363
|
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
3329
|
-
const uint64_t qx_buf_offset = extra_src0->offset;
|
3364
|
+
const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
|
3330
3365
|
GGML_ASSERT(d_Qx != nullptr);
|
3331
3366
|
if (!src1_uma) {
|
3332
3367
|
d_Qy = extra_src1->buffer_gpu.lock();
|
3333
|
-
qy_buf_offset = extra_src1->offset;
|
3368
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
3334
3369
|
GGML_ASSERT(d_Qx != nullptr);
|
3335
3370
|
}
|
3336
3371
|
|
@@ -3350,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
3350
3385
|
}
|
3351
3386
|
|
3352
3387
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3353
|
-
|
3354
|
-
std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
|
3355
|
-
#endif
|
3388
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
|
3356
3389
|
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
|
3357
3390
|
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
|
3358
3391
|
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
|
@@ -3365,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
3365
3398
|
}
|
3366
3399
|
|
3367
3400
|
static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
3368
|
-
|
3369
|
-
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3401
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3370
3402
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3371
3403
|
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
|
3372
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
3373
|
-
#endif
|
3404
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
3374
3405
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
3375
3406
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
3376
3407
|
|
@@ -3459,7 +3490,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
3459
3490
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
3460
3491
|
|
3461
3492
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3462
|
-
const uint64_t d_buf_offset = extra->offset;
|
3493
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
3463
3494
|
GGML_ASSERT(d_D != nullptr);
|
3464
3495
|
vk_buffer d_X;
|
3465
3496
|
uint64_t x_buf_offset = 0;
|
@@ -3467,17 +3498,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
3467
3498
|
uint64_t y_buf_offset = 0;
|
3468
3499
|
if (!src0_uma) {
|
3469
3500
|
d_Qx = extra_src0->buffer_gpu.lock();
|
3470
|
-
qx_buf_offset = extra_src0->offset;
|
3501
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
3471
3502
|
GGML_ASSERT(d_Qx != nullptr);
|
3472
3503
|
}
|
3473
3504
|
if (!src1_uma) {
|
3474
3505
|
d_Qy = extra_src1->buffer_gpu.lock();
|
3475
|
-
qy_buf_offset = extra_src1->offset;
|
3506
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
3476
3507
|
GGML_ASSERT(d_Qy != nullptr);
|
3477
3508
|
}
|
3478
3509
|
if (!ids_uma) {
|
3479
3510
|
d_ids = extra_ids->buffer_gpu.lock();
|
3480
|
-
ids_buf_offset = extra_ids->offset;
|
3511
|
+
ids_buf_offset = extra_ids->offset + ids->view_offs;
|
3481
3512
|
GGML_ASSERT(d_ids != nullptr);
|
3482
3513
|
}
|
3483
3514
|
if (qx_needs_dequant) {
|
@@ -3556,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
3556
3587
|
}
|
3557
3588
|
|
3558
3589
|
static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
3559
|
-
|
3560
|
-
std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3590
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3561
3591
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3562
3592
|
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
|
3563
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
3564
|
-
#endif
|
3593
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
3565
3594
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
3566
3595
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
3567
3596
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
@@ -3636,7 +3665,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
3636
3665
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
3637
3666
|
|
3638
3667
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3639
|
-
const uint64_t d_buf_offset = extra->offset;
|
3668
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
3640
3669
|
GGML_ASSERT(d_D != nullptr);
|
3641
3670
|
vk_buffer d_X;
|
3642
3671
|
uint64_t x_buf_offset = 0;
|
@@ -3644,17 +3673,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
3644
3673
|
uint64_t y_buf_offset = 0;
|
3645
3674
|
if(!src0_uma) {
|
3646
3675
|
d_Qx = extra_src0->buffer_gpu.lock();
|
3647
|
-
qx_buf_offset = extra_src0->offset;
|
3676
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
3648
3677
|
GGML_ASSERT(d_Qx != nullptr);
|
3649
3678
|
}
|
3650
3679
|
if(!src1_uma) {
|
3651
3680
|
d_Qy = extra_src1->buffer_gpu.lock();
|
3652
|
-
qy_buf_offset = extra_src1->offset;
|
3681
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
3653
3682
|
GGML_ASSERT(d_Qy != nullptr);
|
3654
3683
|
}
|
3655
3684
|
if(!ids_uma) {
|
3656
3685
|
d_ids = extra_ids->buffer_gpu.lock();
|
3657
|
-
ids_buf_offset = extra_ids->offset;
|
3686
|
+
ids_buf_offset = extra_ids->offset + ids->view_offs;
|
3658
3687
|
GGML_ASSERT(d_ids != nullptr);
|
3659
3688
|
}
|
3660
3689
|
if (qx_needs_dequant) {
|
@@ -3724,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
 }

 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
     } else {
@@ -3769,9 +3796,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;

     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;

     std::vector<vk::BufferCopy> copies;

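Note: from here on the diff replaces each `#ifdef GGML_VULKAN_DEBUG` / `std::cerr` / `#endif` triple with a single `VK_LOG_DEBUG(...)` call (and `VK_LOG_MEMORY(...)` for allocation paths). The macro is defined earlier in ggml-vulkan.cpp, outside this excerpt; a sketch of the assumed shape:

    // Assumed definition (the real one lives near the top of ggml-vulkan.cpp):
    // expands to a std::cerr chain when GGML_VULKAN_DEBUG is set and to nothing
    // otherwise, so call sites no longer need their own #ifdef guards.
    #ifdef GGML_VULKAN_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg)
    #endif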
@@ -3908,10 +3935,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f32;
         }
         if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f16;
         }
     }
     return nullptr;
@@ -3960,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {

 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
         std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
@@ -4062,21 +4087,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }

     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }

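Note: the destination offset is rounded down to the device's `minStorageBufferOffsetAlignment` because Vulkan requires storage-buffer bindings to start at aligned offsets; the remainder is carried into the shader as `d_offset` (see the `ggml_vk_cpy` hunk below). A worked sketch with hypothetical numbers:

    // Sketch: with offset = 4100 bytes and a 256-byte alignment limit, the
    // binding starts at (4100 / 256) * 256 = 4096 and the shader indexes
    // past the remaining 4 bytes itself.
    static uint64_t align_down(uint64_t offset, uint64_t align) {
        return (offset / align) * align;
    }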
@@ -4155,24 +4180,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-        const int mode = ((int32_t *) dst->op_params)[2];
-        const bool is_neox = mode & 2;
-
-        if (is_neox) {
-            // Empty src2 is possible in rope, but the shader needs a buffer
-            vk_subbuffer subbuf_z;
-            if (use_src2) {
-                subbuf_z = { d_Z, z_buf_offset, z_sz };
-            } else {
-                subbuf_z = { d_X, 0, d_X->size };
-            }
-
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4353,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;

     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4411,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,

 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4421,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];

-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-
-
-
-
-
-
-
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }

 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
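Note: `theta_scale` packs the RoPE base into a single per-pair multiplier: dimension pair i rotates by pos * theta_scale^i, which equals the usual pos * freq_base^(-2i/n_dims). A quick check with common (illustrative) values:

    // freq_base = 10000, n_dims = 128:
    //   theta_scale = powf(10000.0f, -2.0f / 128.0f)  ~ 0.866
    //   pair i angle: pos * powf(theta_scale, (float) i)
    //               == pos * powf(10000.0f, -2.0f * i / 128.0f)
    const float theta_scale = powf(freq_base, -2.0f / n_dims);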
@@ -4487,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0

 template <typename X_TYPE, typename Y_TYPE>
 static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -4903,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
 }

 static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
     // Check transfers are correct
     vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);

@@ -4989,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
 }

 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
     const size_t x_sz = sizeof(float) * ne;
     const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5068,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 }

 static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -5254,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 #endif

 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
     ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     extra->reset();
     tensor->extra = extra;
@@ -5264,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
 }

 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

     if (extra == nullptr) {
@@ -5301,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm

     bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;

-    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
     const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

     int split_k;
@@ -5379,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }

 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
-#endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5520,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #endif

     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5527,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
     }
     if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
         // Resize buffer
         if (ctx->prealloc_y != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5534,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
     }
     if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
         // Resize buffer
         if (ctx->prealloc_split_k != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5541,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
     }
     if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
         // Resize buffer
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
@@ -5558,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }

-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
     ctx->semaphore_idx = 0;
     ctx->staging_offset = 0;

@@ -5569,6 +5561,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];

     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5589,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5596,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5648,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);

-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5700,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }

-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;

 #ifdef GGML_VULKAN_CHECK_RESULTS
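Note: the reshuffled switch makes the layout-only ops bail out before a compute context is created: RESHAPE, VIEW, PERMUTE, TRANSPOSE, and NONE only rewrite `ne`/`nb`/`view_offs` metadata, so there is nothing to dispatch on the GPU. A sketch of the predicate this implies (the function name is ours, not the gem's):

    // Sketch: ops that never touch tensor data, only its shape/stride metadata.
    static bool vk_op_is_noop(ggml_op op) {
        return op == GGML_OP_RESHAPE || op == GGML_OP_VIEW  ||
               op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE ||
               op == GGML_OP_NONE;
    }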
@@ -5788,16 +5775,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         return true;
     }

-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif

-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];

     // Only run if ctx hasn't been submitted yet
@@ -5822,16 +5805,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }

-    extra->ready = false;
-
     return true;
 }

 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
     for (auto& buffer : ctx->gc.temp_buffers) {
         ggml_vk_pool_free(ctx, buffer);
     }
@@ -5875,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {

 // Clean up on backend free
 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
     ggml_vk_graph_cleanup(ctx);

     ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5943,7 +5920,9 @@ struct ggml_backend_vk_buffer_context {

     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }

     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5970,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
 }

 GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -5985,49 +5962,41 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 }

 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }

 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     vk_buffer buf = extra->buffer_gpu.lock();

-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }

 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     vk_buffer buf = extra->buffer_gpu.lock();

-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }

 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6007,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));

         return true;
     }
@@ -6078,11 +6047,15 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
 }

 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }

     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);

@@ -6105,33 +6078,19 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
     ggml_vk_instance_init();

-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");

     GGML_ASSERT(dev_num < vk_instance.device_indices.size());

@@ -6155,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
 }

 GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
 }

 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
     size += 32; // Behave like the CPU buffer type
     void * ptr = nullptr;
     try {
@@ -6198,7 +6153,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
     /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6222,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {

 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");

     size_t idx = ctx->idx;

@@ -6248,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
 }

 GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

@@ -6264,13 +6214,11 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g

     vk_buffer buf = extra->buffer_gpu.lock();

-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }

 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

@@ -6284,13 +6232,11 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c

     vk_buffer buf = extra->buffer_gpu.lock();

-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }

 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
         ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6305,7 +6251,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }

@@ -6313,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 }

 GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if(ctx->transfer_ctx == nullptr) {
         return;
@@ -6343,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
 }

 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6402,7 +6344,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_UNARY_OP_GELU:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_RELU:
-            return true;
+            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -6478,11 +6420,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
             // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6456,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
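Note: `supports_buft` moves the compatibility check from the buffer type (the `supports_backend` callback removed above) onto the backend itself, matching the interface change in ggml-backend in this same release. A hedged sketch of how a caller might use the new callback (the helper is hypothetical, not part of the gem):

    // Sketch: ask a backend whether it can consume tensors allocated in a
    // given buffer type without a copy.
    static bool backend_can_use_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        return backend->iface.supports_buft != NULL &&
               backend->iface.supports_buft(backend, buft);
    }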
@@ -6529,9 +6478,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -6549,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
     if (vk_instance.initialized[dev_num]) {
        return vk_instance.backends[dev_num];
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");

     ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
     ggml_vk_init(ctx, dev_num);
@@ -6725,7 +6674,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }

     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6767,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }

-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");

     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -6809,7 +6756,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6798,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6856,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7075,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }

-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");

     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -7092,11 +7037,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }

-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }

     float first_error_result = -1.0f;