llama_cpp 0.16.0 → 0.16.2
This diff shows the changes between publicly released versions of this package, as they appear in the public registry to which it was published. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#include "ggml-vulkan.h"
|
|
2
|
-
|
|
2
|
+
#include <vulkan/vulkan_core.h>
|
|
3
3
|
#ifdef GGML_VULKAN_RUN_TESTS
|
|
4
4
|
#include <chrono>
|
|
5
5
|
#endif
|
|
@@ -8,13 +8,15 @@
|
|
|
8
8
|
|
|
9
9
|
#include <algorithm>
|
|
10
10
|
#include <cmath>
|
|
11
|
+
#include <iomanip>
|
|
11
12
|
#include <iostream>
|
|
12
|
-
#include <limits>
|
|
13
13
|
#include <tuple>
|
|
14
14
|
#include <vector>
|
|
15
15
|
#include <sstream>
|
|
16
16
|
#include <utility>
|
|
17
17
|
#include <memory>
|
|
18
|
+
#include <limits>
|
|
19
|
+
#include <map>
|
|
18
20
|
|
|
19
21
|
#include "ggml.h"
|
|
20
22
|
#include "ggml-backend-impl.h"
|
|
@@ -56,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
|
|
|
56
58
|
} \
|
|
57
59
|
} while (0)
|
|
58
60
|
|
|
61
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
62
|
+
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
|
|
63
|
+
#else
|
|
64
|
+
#define VK_LOG_DEBUG(msg) ((void) 0)
|
|
65
|
+
#endif // GGML_VULKAN_DEBUG
|
|
66
|
+
|
|
59
67
|
struct ggml_backend_vk_context;
|
|
60
68
|
|
|
61
69
|
struct vk_queue {
|
|
@@ -150,7 +158,7 @@ struct vk_device {
|
|
|
150
158
|
vk_pipeline pipeline_relu_f32;
|
|
151
159
|
vk_pipeline pipeline_diag_mask_inf_f32;
|
|
152
160
|
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
|
153
|
-
vk_pipeline
|
|
161
|
+
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
|
|
154
162
|
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
|
|
155
163
|
vk_pipeline pipeline_argsort_f32;
|
|
156
164
|
vk_pipeline pipeline_sum_rows_f32;
|
|
@@ -158,9 +166,7 @@ struct vk_device {
|
|
|
158
166
|
std::vector<vk_pipeline_ref> pipelines;
|
|
159
167
|
|
|
160
168
|
~vk_device() {
|
|
161
|
-
|
|
162
|
-
std::cerr << "destroy device " << name << std::endl;
|
|
163
|
-
#endif
|
|
169
|
+
VK_LOG_DEBUG("destroy device " << name);
|
|
164
170
|
device.destroyCommandPool(compute_queue.pool);
|
|
165
171
|
if (!single_queue) {
|
|
166
172
|
device.destroyCommandPool(transfer_queue.pool);
|
|
@@ -195,9 +201,7 @@ struct vk_buffer_struct {
|
|
|
195
201
|
if (size == 0) {
|
|
196
202
|
return;
|
|
197
203
|
}
|
|
198
|
-
|
|
199
|
-
std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
|
|
200
|
-
#endif
|
|
204
|
+
VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
|
|
201
205
|
|
|
202
206
|
device->device.freeMemory(device_memory);
|
|
203
207
|
device->device.destroyBuffer(buffer);
|
|
@@ -283,26 +287,15 @@ struct vk_op_diag_mask_push_constants {
|
|
|
283
287
|
|
|
284
288
|
struct vk_op_rope_push_constants {
|
|
285
289
|
uint32_t ncols;
|
|
290
|
+
uint32_t n_dims;
|
|
286
291
|
float freq_scale;
|
|
287
292
|
uint32_t p_delta_rows;
|
|
288
293
|
float freq_base;
|
|
289
294
|
float ext_factor;
|
|
290
295
|
float attn_factor;
|
|
291
|
-
float corr_dims[
|
|
292
|
-
};
|
|
293
|
-
|
|
294
|
-
struct vk_op_rope_neox_push_constants {
|
|
295
|
-
uint32_t ncols;
|
|
296
|
-
uint32_t ndims;
|
|
297
|
-
float freq_scale;
|
|
298
|
-
uint32_t p_delta_rows;
|
|
299
|
-
float freq_base;
|
|
300
|
-
float ext_factor;
|
|
301
|
-
float attn_factor;
|
|
302
|
-
float corr_dims[4];
|
|
296
|
+
float corr_dims[2];
|
|
303
297
|
float theta_scale;
|
|
304
|
-
|
|
305
|
-
uint32_t has_freq_facs;
|
|
298
|
+
uint32_t has_ff;
|
|
306
299
|
};
|
|
307
300
|
|
|
308
301
|
struct vk_op_soft_max_push_constants {
|
|
@@ -345,15 +338,12 @@ struct vk_context {
|
|
|
345
338
|
};
|
|
346
339
|
|
|
347
340
|
struct ggml_tensor_extra_gpu {
|
|
348
|
-
bool ready;
|
|
349
|
-
|
|
350
341
|
size_t ctx_idx;
|
|
351
342
|
|
|
352
343
|
vk_buffer_ref buffer_gpu;
|
|
353
344
|
uint64_t offset;
|
|
354
345
|
|
|
355
346
|
void reset() {
|
|
356
|
-
ready = false;
|
|
357
347
|
ctx_idx = 0;
|
|
358
348
|
buffer_gpu.reset();
|
|
359
349
|
offset = 0;
|
|
@@ -368,6 +358,49 @@ struct ggml_vk_garbage_collector {
|
|
|
368
358
|
std::vector<vk_context> contexts;
|
|
369
359
|
};
|
|
370
360
|
|
|
361
|
+
#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
|
|
362
|
+
#include <mutex>
|
|
363
|
+
|
|
364
|
+
#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
|
|
365
|
+
|
|
366
|
+
static std::string format_size(size_t size) {
|
|
367
|
+
const size_t kib = 1024;
|
|
368
|
+
const size_t mib = kib * 1024;
|
|
369
|
+
const size_t gib = mib * 1024;
|
|
370
|
+
|
|
371
|
+
std::ostringstream oss;
|
|
372
|
+
oss << std::fixed << std::setprecision(2);
|
|
373
|
+
|
|
374
|
+
if (size >= gib) {
|
|
375
|
+
oss << static_cast<double>(size) / gib << " GiB";
|
|
376
|
+
} else if (size >= mib) {
|
|
377
|
+
oss << static_cast<double>(size) / mib << " MiB";
|
|
378
|
+
} else if (size >= kib) {
|
|
379
|
+
oss << static_cast<double>(size) / kib << " KiB";
|
|
380
|
+
} else {
|
|
381
|
+
oss << size << " B";
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
return oss.str();
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
static std::mutex log_mutex;
|
|
388
|
+
|
|
389
|
+
class vk_memory_logger {
|
|
390
|
+
public:
|
|
391
|
+
vk_memory_logger(): total_device(0), total_host(0) {}
|
|
392
|
+
void log_allocation(vk_buffer_ref buf_ref, size_t size);
|
|
393
|
+
void log_deallocation(vk_buffer_ref buf_ref);
|
|
394
|
+
|
|
395
|
+
private:
|
|
396
|
+
std::map<vk::Buffer, size_t> allocations; // Track allocations
|
|
397
|
+
size_t total_device;
|
|
398
|
+
size_t total_host;
|
|
399
|
+
};
|
|
400
|
+
#else
|
|
401
|
+
#define VK_LOG_MEMORY(msg) ((void) 0)
|
|
402
|
+
#endif // GGML_VULKAN_MEMORY_DEBUG
|
|
403
|
+
|
|
371
404
|
struct ggml_backend_vk_context {
|
|
372
405
|
std::string name;
|
|
373
406
|
|
|
@@ -392,8 +425,45 @@ struct ggml_backend_vk_context {
|
|
|
392
425
|
bool initialized;
|
|
393
426
|
|
|
394
427
|
size_t idx;
|
|
428
|
+
|
|
429
|
+
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
430
|
+
vk_memory_logger memory_logger;
|
|
431
|
+
#endif
|
|
395
432
|
};
|
|
396
433
|
|
|
434
|
+
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
435
|
+
void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
|
|
436
|
+
std::lock_guard<std::mutex> guard(log_mutex);
|
|
437
|
+
vk_buffer buf = buf_ref.lock();
|
|
438
|
+
const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
439
|
+
const std::string type = device ? "device" : "host";
|
|
440
|
+
allocations[buf->buffer] = size;
|
|
441
|
+
total_device += device ? size : 0;
|
|
442
|
+
total_host += device ? 0 : size;
|
|
443
|
+
VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
|
|
447
|
+
if (buf_ref.expired() || buf_ref.lock()->size == 0) {
|
|
448
|
+
return;
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
std::lock_guard<std::mutex> guard(log_mutex);
|
|
452
|
+
vk_buffer buf = buf_ref.lock();
|
|
453
|
+
const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
454
|
+
std::string type = device ? "device" : "host";
|
|
455
|
+
auto it = allocations.find(buf->buffer);
|
|
456
|
+
total_device -= device ? it->second : 0;
|
|
457
|
+
total_host -= device ? 0 : it->second;
|
|
458
|
+
if (it != allocations.end()) {
|
|
459
|
+
VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
|
|
460
|
+
allocations.erase(it);
|
|
461
|
+
} else {
|
|
462
|
+
VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
#endif // GGML_VULKAN_MEMORY_DEBUG
|
|
466
|
+
|
|
397
467
|
struct vk_instance_t {
|
|
398
468
|
vk::Instance instance;
|
|
399
469
|
|
|
@@ -406,15 +476,11 @@ struct vk_instance_t {
|
|
|
406
476
|
};
|
|
407
477
|
|
|
408
478
|
static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
|
|
409
|
-
|
|
410
|
-
std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
|
|
411
|
-
#endif
|
|
479
|
+
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
|
|
412
480
|
static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
|
|
413
481
|
|
|
414
482
|
if (devices[idx].expired()) {
|
|
415
|
-
|
|
416
|
-
std::cerr << "Initializing new vk_device" << std::endl;
|
|
417
|
-
#endif
|
|
483
|
+
VK_LOG_DEBUG("Initializing new vk_device");
|
|
418
484
|
std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
|
|
419
485
|
device->initialized = false;
|
|
420
486
|
devices[idx] = device;
|
|
@@ -441,9 +507,7 @@ static vk_instance_t vk_instance;
|
|
|
441
507
|
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
|
|
442
508
|
|
|
443
509
|
static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
|
|
444
|
-
|
|
445
|
-
std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
|
|
446
|
-
#endif
|
|
510
|
+
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
|
447
511
|
GGML_ASSERT(parameter_count > 0);
|
|
448
512
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
|
449
513
|
|
|
@@ -544,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
|
|
|
544
608
|
}
|
|
545
609
|
|
|
546
610
|
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
|
547
|
-
|
|
548
|
-
std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
|
|
549
|
-
#endif
|
|
611
|
+
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
|
|
550
612
|
for (auto& pool : pipeline->descriptor_pools) {
|
|
551
613
|
device.destroyDescriptorPool(pool);
|
|
552
614
|
}
|
|
@@ -564,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
|
|
564
626
|
}
|
|
565
627
|
|
|
566
628
|
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
|
|
567
|
-
|
|
568
|
-
std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
|
|
569
|
-
#endif
|
|
629
|
+
VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
|
570
630
|
if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
|
|
571
631
|
// Enough descriptors are available
|
|
572
632
|
return;
|
|
@@ -596,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
|
|
|
596
656
|
}
|
|
597
657
|
|
|
598
658
|
static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
|
|
599
|
-
|
|
600
|
-
std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
|
|
601
|
-
#endif
|
|
659
|
+
VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
|
|
602
660
|
pipeline->descriptor_set_idx = 0;
|
|
603
661
|
}
|
|
604
662
|
|
|
605
663
|
static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|
606
|
-
|
|
607
|
-
std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
|
|
608
|
-
#endif
|
|
664
|
+
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
|
609
665
|
if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
|
|
610
666
|
// Reuse command buffer
|
|
611
667
|
return q.cmd_buffers[q.cmd_buffer_idx++];
|
|
@@ -625,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
|
|
|
625
681
|
}
|
|
626
682
|
|
|
627
683
|
static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
|
628
|
-
|
|
629
|
-
std::cerr << "ggml_vk_create_submission()" << std::endl;
|
|
630
|
-
#endif
|
|
684
|
+
VK_LOG_DEBUG("ggml_vk_create_submission()");
|
|
631
685
|
vk_submission s;
|
|
632
686
|
s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
|
|
633
687
|
s.wait_semaphores = std::move(wait_semaphores);
|
|
@@ -636,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
|
|
|
636
690
|
}
|
|
637
691
|
|
|
638
692
|
static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
|
|
639
|
-
|
|
640
|
-
std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
|
|
641
|
-
#endif
|
|
693
|
+
VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
|
|
642
694
|
if (ctx->seqs.empty()) {
|
|
643
695
|
return;
|
|
644
696
|
}
|
|
@@ -712,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
|
|
|
712
764
|
}
|
|
713
765
|
|
|
714
766
|
static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
|
|
715
|
-
|
|
716
|
-
std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
|
|
717
|
-
#endif
|
|
767
|
+
VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
|
|
718
768
|
const uint32_t qfsize = queue_family_props.size();
|
|
719
769
|
|
|
720
770
|
// Try with avoid preferences first
|
|
@@ -760,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
|
|
|
760
810
|
}
|
|
761
811
|
|
|
762
812
|
static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
|
|
763
|
-
|
|
764
|
-
std::cerr << "ggml_vk_create_queue()" << std::endl;
|
|
765
|
-
#endif
|
|
813
|
+
VK_LOG_DEBUG("ggml_vk_create_queue()");
|
|
766
814
|
q.queue_family_index = queue_family_index;
|
|
767
815
|
|
|
768
816
|
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
|
|
@@ -776,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
|
|
|
776
824
|
}
|
|
777
825
|
|
|
778
826
|
static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|
779
|
-
|
|
780
|
-
std::cerr << "ggml_vk_create_context()" << std::endl;
|
|
781
|
-
#endif
|
|
827
|
+
VK_LOG_DEBUG("ggml_vk_create_context()");
|
|
782
828
|
ctx->gc.contexts.emplace_back();
|
|
783
829
|
vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
|
|
784
830
|
memset((void *) result, 0, sizeof(vk_context));
|
|
@@ -788,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
|
|
|
788
834
|
}
|
|
789
835
|
|
|
790
836
|
static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
|
|
791
|
-
|
|
792
|
-
std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
|
|
793
|
-
#endif
|
|
837
|
+
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
|
|
794
838
|
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
|
|
795
839
|
vk::SemaphoreCreateInfo ci{};
|
|
796
840
|
ci.setPNext(&tci);
|
|
@@ -800,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
|
|
|
800
844
|
}
|
|
801
845
|
|
|
802
846
|
static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
|
|
803
|
-
|
|
804
|
-
std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
|
|
805
|
-
#endif
|
|
847
|
+
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
|
|
806
848
|
if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
|
|
807
849
|
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
|
|
808
850
|
vk::SemaphoreCreateInfo ci{};
|
|
@@ -821,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
|
|
|
821
863
|
}
|
|
822
864
|
|
|
823
865
|
static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|
824
|
-
|
|
825
|
-
std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
|
|
826
|
-
#endif
|
|
866
|
+
VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
|
|
827
867
|
// Requires command buffers to be done
|
|
828
868
|
|
|
829
869
|
ctx->device->device.resetCommandPool(q.pool);
|
|
@@ -843,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
|
|
|
843
883
|
}
|
|
844
884
|
|
|
845
885
|
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
|
846
|
-
|
|
847
|
-
std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
|
|
848
|
-
#endif
|
|
886
|
+
VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
|
|
849
887
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
|
850
888
|
|
|
851
889
|
if (size == 0) {
|
|
@@ -905,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|
|
905
943
|
|
|
906
944
|
buf->device = ctx->device;
|
|
907
945
|
|
|
908
|
-
#ifdef
|
|
909
|
-
|
|
946
|
+
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
947
|
+
ctx->memory_logger.log_allocation(buf, size);
|
|
910
948
|
#endif
|
|
911
949
|
|
|
912
950
|
return buf;
|
|
@@ -941,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
|
|
|
941
979
|
}
|
|
942
980
|
|
|
943
981
|
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
|
982
|
+
if (buf == nullptr) {
|
|
983
|
+
return;
|
|
984
|
+
}
|
|
985
|
+
|
|
986
|
+
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
987
|
+
buf->ctx->memory_logger.log_deallocation(buf);
|
|
988
|
+
#endif
|
|
989
|
+
|
|
944
990
|
buf.reset();
|
|
945
991
|
}
|
|
946
992
|
|
|
@@ -949,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
|
|
949
995
|
}
|
|
950
996
|
|
|
951
997
|
static void ggml_vk_sync_buffers(vk_context * ctx) {
|
|
952
|
-
|
|
953
|
-
std::cerr << "ggml_vk_sync_buffers()" << std::endl;
|
|
954
|
-
#endif
|
|
998
|
+
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
|
|
955
999
|
const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
|
|
956
1000
|
|
|
957
1001
|
ctx->s->buffer.pipelineBarrier(
|
|
@@ -965,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
|
|
|
965
1009
|
}
|
|
966
1010
|
|
|
967
1011
|
static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
|
|
968
|
-
|
|
969
|
-
std::cerr << "ggml_vk_wait_events()" << std::endl;
|
|
970
|
-
#endif
|
|
1012
|
+
VK_LOG_DEBUG("ggml_vk_wait_events()");
|
|
971
1013
|
if (events.empty()) {
|
|
972
1014
|
return;
|
|
973
1015
|
}
|
|
@@ -1002,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
|
|
|
1002
1044
|
}
|
|
1003
1045
|
|
|
1004
1046
|
static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
1005
|
-
|
|
1006
|
-
std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
|
|
1007
|
-
#endif
|
|
1047
|
+
VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");
|
|
1008
1048
|
|
|
1009
1049
|
const std::shared_ptr<vk_device> device = ctx->device;
|
|
1010
1050
|
|
|
@@ -1055,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1055
1095
|
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1056
1096
|
|
|
1057
1097
|
if (device->fp16) {
|
|
1058
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
|
|
1059
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
|
|
1060
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
|
|
1061
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
|
|
1062
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
|
|
1063
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
|
|
1098
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1099
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1100
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1101
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1102
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1103
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1064
1104
|
|
|
1065
1105
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1066
1106
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1153,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1153
1193
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1154
1194
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1155
1195
|
|
|
1156
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
|
|
1157
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
|
|
1158
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
|
|
1159
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
|
|
1160
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
|
|
1161
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
|
|
1196
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1197
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1198
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1199
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1200
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1201
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1162
1202
|
|
|
1163
1203
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1164
1204
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1244,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1244
1284
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1245
1285
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1246
1286
|
} else {
|
|
1247
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
|
|
1248
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
|
|
1249
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
|
|
1250
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
|
|
1251
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
|
|
1252
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
|
|
1287
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1288
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1289
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1290
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1291
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1292
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1253
1293
|
|
|
1254
1294
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1255
1295
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1342,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1342
1382
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1343
1383
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1344
1384
|
|
|
1345
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
|
|
1346
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
|
|
1347
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
|
|
1348
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
|
|
1349
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
|
|
1350
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
|
|
1385
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1386
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1387
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1388
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1389
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1390
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1351
1391
|
|
|
1352
1392
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1353
1393
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1442,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1442
1482
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1443
1483
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1444
1484
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1445
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "
|
|
1446
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "
|
|
1447
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "
|
|
1448
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "
|
|
1449
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "
|
|
1485
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1486
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1487
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1488
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1489
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1450
1490
|
|
|
1451
1491
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1452
1492
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1455,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1455
1495
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1456
1496
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1457
1497
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1458
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "
|
|
1459
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "
|
|
1460
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "
|
|
1461
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "
|
|
1462
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "
|
|
1498
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1499
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1500
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1501
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1502
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1463
1503
|
|
|
1464
1504
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1465
1505
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1468,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1468
1508
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1469
1509
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1470
1510
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1471
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "
|
|
1472
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "
|
|
1473
|
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     // dequant shaders
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1481,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
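The `ggml_vk_create_pipeline` helper itself is declared earlier in ggml-vulkan.cpp and is outside this diff. As a reading aid for the call sites above, a hedged sketch of what each argument position appears to carry (the annotations are inferred from the calls, not taken from the actual declaration):

// ggml_vk_create_pipeline(ctx, pipeline_handle,
//     "dequant_q2_k",                       // shader name used for lookup/debug output
//     dequant_q2_k_len, dequant_q2_k_data,  // embedded SPIR-V blob and its length
//     "main",                               // shader entry point
//     2,                                    // number of buffer parameters (descriptor bindings)
//     5 * sizeof(uint32_t),                 // push-constant block size
//     {256 * 64, 1, 1},                     // workgroup denominators used when dispatching
//     {},                                   // specialization constants (e.g. { device->subgroup_size })
//     1);                                   // required alignment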
@@ -1537,11 +1577,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->
-    ggml_vk_create_pipeline(ctx, ctx->device->
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1551,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 static void ggml_vk_print_gpu_info(size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
     GGML_ASSERT(vk_instance.initialized);
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
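This hunk establishes the pattern the rest of the file repeats: each `#ifdef GGML_VULKAN_DEBUG` / `std::cerr` / `#endif` triple collapses into one `VK_LOG_DEBUG(...)` call. The macro definitions are outside this excerpt; a minimal sketch of the shape they presumably take (an assumption, not the file's verbatim definition):

#if defined(GGML_VULKAN_DEBUG)
#define VK_LOG_DEBUG(msg) (std::cerr << msg << std::endl)   // stream-style argument: VK_LOG_DEBUG("x=" << x)
#else
#define VK_LOG_DEBUG(msg) ((void) 0)                        // compiles away when debug logging is disabled
#endif

The `VK_LOG_MEMORY` calls introduced further down would follow the same pattern, presumably behind a separate allocation-logging flag such as GGML_VULKAN_MEMORY_DEBUG.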
@@ -1569,8 +1607,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
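The added `driver_props` struct is simply appended to the existing pNext chain, so the single `getProperties2()` call fills every linked struct at once. A self-contained illustration of the same chain (a sketch using vulkan.hpp, assuming a valid `physical_device`):

#include <vulkan/vulkan.hpp>

void query_driver_info(vk::PhysicalDevice physical_device) {
    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceMaintenance3Properties props3;
    vk::PhysicalDeviceSubgroupProperties subgroup_props;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &props3;                   // props2 -> props3
    props3.pNext = &subgroup_props;           //        -> subgroup_props
    subgroup_props.pNext = &driver_props;     //        -> driver_props (the new link)
    physical_device.getProperties2(&props2);  // one call populates the whole chain
    // driver_props.driverName / driver_props.driverID are now usable, e.g. for
    // the "(driver name)" annotation added to the device report below.
}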
@@ -1614,7 +1654,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1628,9 +1668,7 @@ void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1707,10 +1745,80 @@ void ggml_vk_instance_init() {
 
     // Default to using all dedicated GPUs
     for (size_t i = 0; i < devices.size(); i++) {
-        vk::PhysicalDeviceProperties props = devices[i].getProperties();
+        vk::PhysicalDeviceProperties2 new_props;
+        vk::PhysicalDeviceDriverProperties new_driver;
+        vk::PhysicalDeviceIDProperties new_id;
+        new_props.pNext = &new_driver;
+        new_driver.pNext = &new_id;
+        devices[i].getProperties2(&new_props);
+
+        if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+            // Check if there are two physical devices corresponding to the same GPU
+            auto old_device = std::find_if(
+                vk_instance.device_indices.begin(),
+                vk_instance.device_indices.end(),
+                [&devices, &new_id](const size_t k){
+                    vk::PhysicalDeviceProperties2 old_props;
+                    vk::PhysicalDeviceIDProperties old_id;
+                    old_props.pNext = &old_id;
+                    devices[k].getProperties2(&old_props);
+                    return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                }
+            );
+            if (old_device == vk_instance.device_indices.end()) {
+                vk_instance.device_indices.push_back(i);
+            } else {
+                // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                // This can cause error when splitting layers aross the devices, need to keep only 1
+                VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
+
+                vk::PhysicalDeviceProperties2 old_props;
+                vk::PhysicalDeviceDriverProperties old_driver;
+                old_props.pNext = &old_driver;
+                devices[*old_device].getProperties2(&old_props);
+
+                std::map<vk::DriverId, int> driver_priorities {};
+                int old_priority = std::numeric_limits<int>::max();
+                int new_priority = std::numeric_limits<int>::max();
+
+                // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                // Smaller number -> higher priority
+                switch (old_props.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                }
+
+                if (driver_priorities.count(old_driver.driverID)) {
+                    old_priority = driver_priorities[old_driver.driverID];
+                }
+                if (driver_priorities.count(new_driver.driverID)) {
+                    new_priority = driver_priorities[new_driver.driverID];
+                }
+
+                if (new_priority < old_priority) {
+                    auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                    vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                    vk_instance.device_indices.push_back(i);
 
-        if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-            vk_instance.device_indices.push_back(i);
+                    VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
+                }
+                else {
+                    VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
+                }
+            }
         }
     }
 
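Taken together, the new loop does two things: it deduplicates physical devices by `deviceUUID` (the same GPU shows up once per installed Vulkan driver, which breaks layer splitting), and when a duplicate is found it keeps the device whose driver ranks higher in a per-vendor preference table. A minimal standalone sketch of just the ranking step (the helper name and raw PCI vendor IDs are ours; the real code keys the table off `VK_VENDOR_ID_*` constants defined elsewhere in the file):

#include <cstdint>
#include <limits>
#include <map>
#include <vulkan/vulkan.hpp>

// Lower value = preferred driver; unknown drivers rank last.
static int driver_priority(uint32_t vendor_id, vk::DriverId driver_id) {
    std::map<vk::DriverId, int> prio;
    switch (vendor_id) {
        case 0x1002: // AMD
            prio[vk::DriverId::eMesaRadv]       = 1;
            prio[vk::DriverId::eAmdOpenSource]  = 2;
            prio[vk::DriverId::eAmdProprietary] = 3;
            break;
        case 0x8086: // Intel
            prio[vk::DriverId::eIntelOpenSourceMESA]     = 1;
            prio[vk::DriverId::eIntelProprietaryWindows] = 2;
            break;
        case 0x10de: // NVIDIA
            prio[vk::DriverId::eNvidiaProprietary] = 1;
            break;
    }
    auto it = prio.find(driver_id);
    return it == prio.end() ? std::numeric_limits<int>::max() : it->second;
}

// A duplicate is replaced only when driver_priority(vendor, new_id) is strictly
// smaller than driver_priority(vendor, old_id), so ties keep the earlier device.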
@@ -1732,9 +1840,7 @@ void ggml_vk_instance_init() {
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1907,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_Q4_0:
@@ -1931,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -1969,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
     switch (a_type) {
@@ -1996,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
@@ -2031,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32);
 
     switch (a_type) {
@@ -2058,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
     int best_i = -1;
     size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
     int worst_i = -1;
@@ -2088,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
         ggml_vk_destroy_buffer(b);
     }
 
-    return
+    return ggml_vk_create_buffer_device(ctx, size);
 }
 
 static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
     for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
         vk_buffer& b = ctx->buffer_pool[i];
         if (b == nullptr) {
@@ -2115,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
         }
     }
 
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
     // Otherwise create new buffer
     vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
     ctx->gc.temp_buffers.push_back(buf);
@@ -2123,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
 }
 
 static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
     vk_buffer buf = ggml_vk_create_buffer(ctx, size,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2147,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
     if (ptr == nullptr) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
     vk_buffer buf;
     size_t index;
     for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2201,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : buffers) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"
-#endif
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2240,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
 }
 
 static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
@@ -2252,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
 }
 
 static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
     }
@@ -2264,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
 }
 
 static size_t ggml_vk_align_size(size_t width, size_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
     return CEIL_DIV(width, align) * align;
 }
 
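`ggml_vk_align_size` rounds `width` up to the next multiple of `align`. Assuming the usual ggml definition of `CEIL_DIV` (the macro itself is not shown in this diff), a standalone equivalent with worked values:

#include <cstddef>

// CEIL_DIV(a, b) is assumed to be (a + b - 1) / b in integer arithmetic.
static size_t align_size(size_t width, size_t align) {
    return ((width + align - 1) / align) * align;
}
// align_size(100, 64) == 128  (CEIL_DIV(100, 64) == 2)
// align_size(128, 64) == 128  (already aligned, unchanged)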
@@ -2280,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
+        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
         ggml_vk_destroy_buffer(ctx->sync_staging);
         ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2288,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
 }
 
 static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
     GGML_ASSERT(!ggml_is_contiguous(tensor));
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2395,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_cont
 }
 
 static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2432,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_cont
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Staging buffer required
     vk_buffer staging = ctx->staging;
@@ -2469,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_cont
 }
 
 static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
     return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
 }
 
 static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2503,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
 }
 
 static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
     ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
 }
 
 static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
     GGML_ASSERT(width > 0);
     GGML_ASSERT(height > 0);
     GGML_ASSERT(src != nullptr);
@@ -2546,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_conte
 
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Fall back to staging buffer
     vk_buffer staging = ctx->staging;
@@ -2575,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context
 }
 
 static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
     if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
@@ -2599,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 }
 
 static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
    // Make sure both buffers are on same ctx
    GGML_ASSERT(src->ctx == dst->ctx);
 
@@ -2612,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     if (src->ctx == dst->ctx) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
        // Copy within the device
        ggml_backend_vk_context * ctx = src->ctx;
 
@@ -2626,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
     } else {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
        // Copy device to device
        ggml_backend_vk_context * src_ctx = src->ctx;
        ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2646,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 }
 
 static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2663,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
 }
 
 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -2693,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
     const uint64_t ne0 = dst->ne[0];
     const uint64_t ne1 = dst->ne[1];
     const uint64_t ne2 = dst->ne[2];
@@ -2719,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
     // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
     //     return 4;
     // }
@@ -2753,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
 }
 
 static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
     switch (ctx->device->vendor_id) {
     case VK_VENDOR_ID_AMD:
         return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2777,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 }
 
 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
     return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
 }
 
@@ -2789,9 +2838,7 @@ static void ggml_vk_matmul(
     uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
     uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
     uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
         const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2815,12 +2862,10 @@ static void ggml_vk_matmul_id(
     uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
     uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
     uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
+    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
         "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
         "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
-        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"
-#endif
+        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
     ggml_vk_sync_buffers(subctx);
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                               nei0, nei1, nbi1, ne11 };
@@ -2850,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 }
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "
-    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
@@ -2870,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -2949,7 +2990,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +2999,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
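The recurring `+ ... view_offs` change in these hunks is an addressing fix: `extra->offset` locates a tensor's backing allocation inside its Vulkan buffer, but a tensor that is a view of another tensor carries an extra displacement in its `view_offs` field. A sketch of the idea (assuming ggml's `view_offs` semantics; for non-view tensors it is 0, so the sum is harmless there):

// Device-visible byte offset of a tensor inside its vk_buffer:
//   base offset of the (possibly shared) allocation, plus the view's
//   displacement within its parent tensor (0 if not a view).
uint64_t device_offset(const ggml_tensor * t, const ggml_tensor_extra_gpu * extra) {
    return extra->offset + t->view_offs;
}

Without the `view_offs` term, reads and writes on view tensors would land at the parent tensor's start instead of the view's actual data.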
@@ -3045,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -3114,7 +3153,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3161,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3200,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context
 }
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3246,14 +3283,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3273,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_c
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3323,14 +3358,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3350,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
3350
3385
|
}
|
|
3351
3386
|
|
|
3352
3387
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3353
|
-
|
|
3354
|
-
std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
|
|
3355
|
-
#endif
|
|
3388
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
|
|
3356
3389
|
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
|
|
3357
3390
|
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
|
|
3358
3391
|
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
|
|
@@ -3365,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3365
3398
|
}
|
|
3366
3399
|
|
|
3367
3400
|
static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
|
3368
|
-
|
|
3369
|
-
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
3401
|
+
VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
3370
3402
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
|
3371
3403
|
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
|
|
3372
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
|
|
3373
|
-
#endif
|
|
3404
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
|
|
3374
3405
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
|
3375
3406
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
|
3376
3407
|
|
|
@@ -3459,7 +3490,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
|
3459
3490
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3460
3491
|
|
|
3461
3492
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3462
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3493
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3463
3494
|
GGML_ASSERT(d_D != nullptr);
|
|
3464
3495
|
vk_buffer d_X;
|
|
3465
3496
|
uint64_t x_buf_offset = 0;
|
|
@@ -3467,17 +3498,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
|
3467
3498
|
uint64_t y_buf_offset = 0;
|
|
3468
3499
|
if (!src0_uma) {
|
|
3469
3500
|
d_Qx = extra_src0->buffer_gpu.lock();
|
|
3470
|
-
qx_buf_offset = extra_src0->offset;
|
|
3501
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3471
3502
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3472
3503
|
}
|
|
3473
3504
|
if (!src1_uma) {
|
|
3474
3505
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3475
|
-
qy_buf_offset = extra_src1->offset;
|
|
3506
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3476
3507
|
GGML_ASSERT(d_Qy != nullptr);
|
|
3477
3508
|
}
|
|
3478
3509
|
if (!ids_uma) {
|
|
3479
3510
|
d_ids = extra_ids->buffer_gpu.lock();
|
|
3480
|
-
ids_buf_offset = extra_ids->offset;
|
|
3511
|
+
ids_buf_offset = extra_ids->offset + ids->view_offs;
|
|
3481
3512
|
GGML_ASSERT(d_ids != nullptr);
|
|
3482
3513
|
}
|
|
3483
3514
|
if (qx_needs_dequant) {
|
|
@@ -3556,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 }
 
 static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3636,7 +3665,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3673,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3724,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
 }
 
 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
     } else {
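
Note: every `#ifdef GGML_VULKAN_DEBUG` / `std::cerr` / `#endif` triplet in this file is collapsed into a `VK_LOG_DEBUG(...)` call in 0.16.2, and the allocation paths into a separately gated `VK_LOG_MEMORY(...)`. The macros are defined near the top of ggml-vulkan.cpp (outside this excerpt); the definition is presumably along these lines:

```cpp
// When the flag is off, the entire macro argument vanishes at preprocessing
// time, so the stream expression is never evaluated and costs nothing.
#ifdef GGML_VULKAN_DEBUG
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif
```

This also explains the odd-looking multi-line calls above: the preprocessor keeps scanning a macro argument until the matching `)`, so the interior `std::cerr << ...;` statements are part of the single `VK_LOG_DEBUG(...)` invocation and disappear with it when the flag is off.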
@@ -3769,9 +3796,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -3908,10 +3935,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_rope_f32;
+            return ctx->device->pipeline_rope_norm_f32;
         }
         if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_rope_f16;
+            return ctx->device->pipeline_rope_norm_f16;
         }
     }
     return nullptr;
@@ -3960,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
         std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
@@ -4062,21 +4087,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4155,24 +4180,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-
-
-
-
-            // Empty src2 is possible in rope, but the shader needs a buffer
-            vk_subbuffer subbuf_z;
-            if (use_src2) {
-                subbuf_z = { d_Z, z_buf_offset, z_sz };
-            } else {
-                subbuf_z = { d_X, 0, d_X->size };
-            }
-
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4353,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4411,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4421,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-
-
-
-
-
-
-
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
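
Note: this hunk tracks the rope NORM/NEOX unification referenced by the removed `#pragma message` lines (llama.cpp PR 7634): the backend no longer branches on `is_neox`, and the shared push constants gain `n_dims`, a precomputed `theta_scale`, and an "src2 present" flag for the optional frequency-factor tensor. `theta_scale = powf(freq_base, -2.0f/n_dims)` is the per-pair decay of the rotation angle; a small illustrative check of why precomputing it suffices (helper name hypothetical, not from the source):

```cpp
#include <cmath>

// RoPE rotates dimension pair i at angle theta_i = pos * freq_base^(-2*i/n_dims).
// Passing theta_scale = freq_base^(-2/n_dims) lets the shader update the angle
// with one multiply per pair (theta_{i+1} = theta_i * theta_scale) instead of a
// powf per element.
static float rope_theta(float freq_base, int n_dims, int i, int pos) {
    const float theta_scale = powf(freq_base, -2.0f / n_dims);
    float theta = (float) pos;
    for (int j = 0; j < i; ++j) {
        theta *= theta_scale;
    }
    return theta; // == pos * powf(freq_base, -2.0f * i / n_dims), up to rounding
}
```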
@@ -4487,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
 
 template <typename X_TYPE, typename Y_TYPE>
 static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -4903,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
 }
 
 static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
     // Check transfers are correct
     vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
@@ -4989,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
 }
 
 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
     const size_t x_sz = sizeof(float) * ne;
     const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5068,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 }
 
 static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -5254,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 #endif
 
 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
     ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     extra->reset();
     tensor->extra = extra;
@@ -5264,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
 }
 
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (extra == nullptr) {
@@ -5301,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 
     bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
 
-    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
     const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
 
     int split_k;
@@ -5379,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
-#endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5520,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #endif
 
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5527,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
     }
     if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
         // Resize buffer
         if (ctx->prealloc_y != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5534,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
     }
    if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
         // Resize buffer
         if (ctx->prealloc_split_k != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5541,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
     }
     if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
         // Resize buffer
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
@@ -5558,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
     ctx->semaphore_idx = 0;
     ctx->staging_offset = 0;
 
@@ -5569,6 +5561,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5589,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5596,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5648,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5700,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5788,16 +5775,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         return true;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,16 +5805,12 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
     for (auto& buffer : ctx->gc.temp_buffers) {
         ggml_vk_pool_free(ctx, buffer);
     }
@@ -5875,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 
 // Clean up on backend free
 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
     ggml_vk_graph_cleanup(ctx);
 
     ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5943,7 +5920,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5970,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -5985,49 +5962,41 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
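
Note: the `ggml_backend_vk_buffer_init_tensor` hunk above is the other half of the `view_offs` fix. Previously every view got its own extra with a pre-combined offset (`extra_view->offset + tensor->view_offs`), which could go stale and doubled the bookkeeping; now a view simply aliases its parent's extra and the displacement is applied at each use site. Condensed restatement of the new logic (identifiers as in the source, trimmed for illustration):

```cpp
// After this change a view tensor carries no GPU state of its own: its extra is
// the parent's, and extra->offset + tensor->view_offs locates its bytes.
if (tensor->view_src != nullptr) {
    GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
    GGML_ASSERT(tensor->view_src->extra != nullptr); // parent initialized first
    tensor->extra = tensor->view_src->extra;
} else {
    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
    extra->buffer_gpu = ctx->dev_buffer;
    extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
    tensor->extra = extra;
}
```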
@@ -6038,7 +6007,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
     return true;
 }
@@ -6078,11 +6047,15 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
 }
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
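
Note: device allocation can fail (for example on `VK_ERROR_OUT_OF_DEVICE_MEMORY`), and Vulkan-Hpp reports that by throwing `vk::SystemError`; 0.16.0 let the exception escape through the C interface. Returning `nullptr` instead matches the ggml-backend contract, where a failed `alloc_buffer` is reported to the caller rather than aborting the process. A hypothetical caller-side fragment:

```cpp
// ggml_backend_buft_alloc_buffer() forwards to the iface above; with this fix a
// too-large request yields nullptr that the caller can handle, e.g. by reducing
// the batch size, instead of an uncaught vk::SystemError terminating the process.
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
if (buf == nullptr) {
    fprintf(stderr, "vulkan: failed to allocate %zu bytes\n", size);
    // fall back: retry with a smaller size or a host buffer type here
}
```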
@@ -6105,33 +6078,19 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
     ggml_vk_instance_init();
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
 
     GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
@@ -6155,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
 }
 
 GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
 }
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
     size += 32; // Behave like the CPU buffer type
     void * ptr = nullptr;
     try {
@@ -6198,7 +6153,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6222,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
 
     size_t idx = ctx->idx;
 
@@ -6248,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
 }
 
 GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6264,13 +6214,11 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6284,13 +6232,11 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
         ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6305,7 +6251,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
 
@@ -6313,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 }
 
 GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if(ctx->transfer_ctx == nullptr) {
         return;
@@ -6343,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6402,7 +6344,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const 
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
         }
@@ -6478,11 +6420,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const 
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6456,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
@@ -6529,9 +6478,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize             = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .supports_op             = */ ggml_backend_vk_supports_op,
+    /* .supports_buft           = */ ggml_backend_vk_supports_buft,
     /* .offload_op              = */ ggml_backend_vk_offload_op,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
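
Note: the interface edits above follow an upstream ggml-backend API change bundled in this release (see the ggml-backend.h/.c entries in the file list): buffer types no longer answer `supports_backend`; instead each backend implements `supports_buft`, and a `graph_plan_update` slot was added to `ggml_backend_i`. The query direction flips, which is why `ggml_backend_vk_buffer_type_supports_backend` was deleted earlier and reappears as `ggml_backend_vk_supports_buft` with the same device-index comparison. A sketch of the caller side, assuming the public `ggml_backend_supports_buft` accessor added alongside this change:

```cpp
// Old direction (removed): buft->iface.supports_backend(buft, backend)
// New direction:           backend->iface.supports_buft(backend, buft)
// Asking the backend lets it accept foreign buffer types explicitly.
bool vk_can_use = ggml_backend_supports_buft(backend, buft);
```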
@@ -6549,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
     if (vk_instance.initialized[dev_num]) {
         return vk_instance.backends[dev_num];
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
 
     ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
     ggml_vk_init(ctx, dev_num);
@@ -6725,7 +6674,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6767,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
 
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -6809,7 +6756,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6798,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6856,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7075,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
 
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -7092,11 +7037,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-    if (extra->offset + tensor_size >= buffer_gpu->size) {
-        tensor_size = buffer_gpu->size - (extra->offset);
+    if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+        tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
     }
 
-    ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+    ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;