@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/ggml/src/ggml-vulkan.cpp

@@ -1,6 +1,6 @@
 #include "ggml-vulkan.h"
 #include <vulkan/vulkan_core.h>
-#
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF)
 #include <chrono>
 #endif

@@ -17,10 +17,13 @@
 #include <memory>
 #include <limits>
 #include <map>
+#include <unordered_map>
 #include <memory>
 #include <mutex>
+#include <future>
+#include <thread>

-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-vulkan-shaders.hpp"
@@ -34,9 +37,7 @@
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de

-#define
-#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
-#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
+#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32

 #define GGML_VK_MAX_NODES 8192

@@ -74,6 +75,8 @@ struct vk_queue {
     std::vector<vk::CommandBuffer> cmd_buffers;

     vk::PipelineStageFlags stage_flags;
+
+    bool transfer_only;
 };

 struct vk_pipeline_struct {
@@ -116,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context {
     vk_device device;
 };

-
-
-
-
-
+static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
+static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
+static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
+static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
@@ -133,6 +136,9 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
 #endif
+#ifdef GGML_VULKAN_PERF
+class vk_perf_logger;
+#endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);

 struct vk_device_struct {
@@ -148,7 +154,6 @@ struct vk_device_struct {
     vk_queue compute_queue;
     vk_queue transfer_queue;
     bool single_queue;
-    uint32_t descriptor_set_mode;
     uint32_t subgroup_size;
     bool uma;

@@ -177,26 +182,40 @@ struct vk_device_struct {
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
     vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
     vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_acc_f32;
+    vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
     vk_pipeline pipeline_mul_f32;
     vk_pipeline pipeline_div_f32;
-    vk_pipeline
+    vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
+    vk_pipeline pipeline_upscale_f32;
     vk_pipeline pipeline_scale_f32;
     vk_pipeline pipeline_sqr_f32;
+    vk_pipeline pipeline_sin_f32;
+    vk_pipeline pipeline_cos_f32;
     vk_pipeline pipeline_clamp_f32;
+    vk_pipeline pipeline_pad_f32;
+    vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
+    vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
     vk_pipeline pipeline_gelu_f32;
+    vk_pipeline pipeline_gelu_quick_f32;
     vk_pipeline pipeline_silu_f32;
     vk_pipeline pipeline_relu_f32;
+    vk_pipeline pipeline_leaky_relu_f32;
+    vk_pipeline pipeline_tanh_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
     vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
+    vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
+    vk_pipeline pipeline_timestep_embedding_f32;

-    std::
+    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
+    std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;

     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;

@@ -208,6 +227,9 @@ struct vk_device_struct {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
 #endif
+#ifdef GGML_VULKAN_PERF
+    std::unique_ptr<vk_perf_logger> perf_logger;
+#endif

     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -222,11 +244,11 @@ struct vk_device_struct {
         }

         for (auto& pipeline : pipelines) {
-            if (pipeline.expired()) {
+            if (pipeline.second.expired()) {
                 continue;
             }

-            vk_pipeline pl = pipeline.lock();
+            vk_pipeline pl = pipeline.second.lock();
             ggml_vk_destroy_pipeline(device, pl);
         }
         pipelines.clear();
@@ -259,6 +281,10 @@ struct vk_subbuffer {
     vk_buffer buffer;
     uint64_t offset;
     uint64_t size;
+
+    operator vk::DescriptorBufferInfo() const {
+        return { buffer->buffer, offset, size };
+    }
 };

 struct vk_semaphore {
@@ -320,7 +346,7 @@ struct vk_op_binary_push_constants {
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
     uint32_t d_offset;
-    float param1; float param2;
+    float param1; float param2; int32_t param3;
 };

 struct vk_op_diag_mask_push_constants {
@@ -358,6 +384,25 @@ struct vk_op_argsort_push_constants {
     int32_t order;
 };

+struct vk_op_im2col_push_constants {
+    uint32_t batch_offset; uint32_t offset_delta;
+    uint32_t IC;
+    uint32_t IW; uint32_t IH;
+    uint32_t OW; uint32_t OH;
+    uint32_t KW; uint32_t KH;
+    uint32_t pelements;
+    uint32_t CHW;
+    int32_t s0; int32_t s1;
+    int32_t p0; int32_t p1;
+    int32_t d0; int32_t d1;
+};
+
+struct vk_op_timestep_embedding_push_constants {
+    uint32_t nb1;
+    uint32_t dim;
+    uint32_t max_period;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -367,32 +412,26 @@ struct vk_staging_memcpy {
     size_t n;
 };

-struct
-
+struct vk_op_upscale_push_constants {
+    uint32_t ne; uint32_t d_offset;
+    uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
+    float sf0; float sf1; float sf2; float sf3;
+};

+struct vk_context_struct {
     vk_submission * s;
     std::vector<vk_sequence> seqs;

-
+    int exit_tensor_idx;

     std::vector<vk_staging_memcpy> in_memcpys;
     std::vector<vk_staging_memcpy> out_memcpys;

     vk_queue * q;
 };
-
-
-    size_t ctx_idx;
-
-    vk_buffer_ref buffer_gpu;
-    uint64_t offset;
-
-    void reset() {
-        ctx_idx = 0;
-        buffer_gpu.reset();
-        offset = 0;
-    }
-};
+typedef std::shared_ptr<vk_context_struct> vk_context;
+typedef std::weak_ptr<vk_context_struct> vk_context_ref;

 struct ggml_vk_garbage_collector {
     std::vector<vk_semaphore> tl_semaphores;
@@ -443,6 +482,48 @@ private:
 #define VK_LOG_MEMORY(msg) ((void) 0)
 #endif // GGML_VULKAN_MEMORY_DEBUG

+#if defined(GGML_VULKAN_PERF)
+
+class vk_perf_logger {
+public:
+    void print_timings() {
+        std::cerr << "----------------\nVulkan Timings:" << std::endl;
+        for (const auto& t : timings) {
+            uint64_t total = 0;
+            for (const auto& time : t.second) {
+                total += time;
+            }
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+        }
+
+        timings.clear();
+    }
+
+    void log_timing(const ggml_tensor * node, uint64_t time) {
+        if (node->op == GGML_OP_UNARY) {
+            timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
+            return;
+        }
+        if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
+            const uint64_t m = node->src[0]->ne[1];
+            const uint64_t n = node->src[1]->ne[1];
+            const uint64_t k = node->src[1]->ne[0];
+            std::string name = ggml_op_name(node->op);
+            if (n == 1) {
+                name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
+            } else {
+                name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
+            }
+            timings[name].push_back(time);
+            return;
+        }
+        timings[ggml_op_name(node->op)].push_back(time);
+    }
+private:
+    std::map<std::string, std::vector<uint64_t>> timings;
+};
+#endif // GGML_VULKAN_PERF
+
 struct ggml_backend_vk_context {
     std::string name;

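The `vk_perf_logger` introduced above buckets per-op GPU times by a string key (the op name, plus m/n/k for matrix multiplies) and reports a count and an average per bucket. The standalone sketch below reproduces only that aggregation pattern with host-side timing; the class, key names, and units shown here are illustrative and are not part of the package.

```cpp
// Minimal, self-contained sketch of the timing-aggregation pattern used by vk_perf_logger.
// All names are hypothetical; the real logger is fed per-op GPU timestamps.
#include <chrono>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

class perf_logger {
public:
    void log_timing(const std::string & name, uint64_t time_us) {
        timings[name].push_back(time_us);
    }
    void print_timings() {
        for (const auto & t : timings) {
            uint64_t total = 0;
            for (uint64_t time : t.second) {
                total += time;
            }
            // count x average; microseconds reported as milliseconds
            std::cout << t.first << ": " << t.second.size() << " x "
                      << (total / t.second.size() / 1000.0) << " ms\n";
        }
        timings.clear();
    }
private:
    std::map<std::string, std::vector<uint64_t>> timings;
};

int main() {
    perf_logger logger;
    for (int i = 0; i < 3; ++i) {
        auto t0 = std::chrono::high_resolution_clock::now();
        volatile double x = 0;                        // stand-in for a dispatched op
        for (int j = 0; j < 1000000; ++j) { x += j; }
        auto t1 = std::chrono::high_resolution_clock::now();
        logger.log_timing("MUL_MAT m=512 n=512 k=512",
            std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
    }
    logger.print_timings();
}
```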
@@ -453,14 +534,38 @@ struct ggml_backend_vk_context {
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
-    vk_buffer staging;
-    size_t staging_size;
-    size_t staging_offset;

     vk_buffer buffer_pool[MAX_VK_BUFFERS];

-
-
+    vk_context_ref compute_ctx;
+    vk_context_ref transfer_ctx;
+
+    std::vector<vk_context_ref> tensor_ctxs;
+};
+
+static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
+
+static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
+    if (tensor->view_src) {
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
+    }
+    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+}
+
+struct ggml_backend_vk_buffer_context {
+    vk_device_ref device;
+    vk_buffer dev_buffer;
+    std::string name;
+
+    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
+        device(device),
+        dev_buffer(dev_buffer),
+        name(name) {
+    }
+
+    ~ggml_backend_vk_buffer_context() {
+        ggml_vk_destroy_buffer(dev_buffer);
+    }
 };

 #ifdef GGML_VULKAN_MEMORY_DEBUG
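A note on the hunk above: for Vulkan-backed tensors the `data` field does not hold a real host address; it stores an offset into the backing device buffer added to the dummy base `vk_ptr_base`, and `vk_tensor_offset` subtracts the base back out (following `view_src` for views). The sketch below shows only that encode/decode arithmetic, using hypothetical names and no ggml or Vulkan types.

```cpp
// Sketch of the offset-as-pointer scheme: a non-null dummy base plus a byte offset is
// stored in a void* "data" field, and the offset is recovered by subtracting the base.
// The dummy pointer is never dereferenced. Names are illustrative only.
#include <cassert>
#include <cstdint>

static void * const dummy_base = (void *)(uintptr_t) 0x1000;

static void * encode_offset(uint64_t offset) {
    return (uint8_t *) dummy_base + offset;
}

static uint64_t decode_offset(const void * data) {
    return (uint64_t)((const uint8_t *) data - (const uint8_t *) dummy_base);
}

int main() {
    void * fake_ptr = encode_offset(4096);   // what a tensor's data field would hold
    assert(decode_offset(fake_ptr) == 4096); // what vk_tensor_offset-style code recovers
    return 0;
}
```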
@@ -510,22 +615,25 @@ static vk_instance_t vk_instance;
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;

-static void ggml_vk_print_tensor(
-static void ggml_vk_check_results_0(
-static void ggml_vk_check_results_1(
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
+static void ggml_vk_check_results_0(ggml_tensor * tensor);
+static void ggml_vk_check_results_1(ggml_tensor * tensor);
 #endif

-typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context
+typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

-
+static void ggml_backend_vk_free(ggml_backend_t backend);

-
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
     VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

-    std::lock_guard<std::mutex> guard(device->mutex);
-
     pipeline = std::make_shared<vk_pipeline_struct>();
     pipeline->name = name;
     pipeline->parameter_count = parameter_count;
@@ -557,35 +665,9 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
     descriptor_set_layout_create_info.setPNext(&dslbfci);
     pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);

-
-
-
-
-        // Try allocating multiple sets from one pool
-        // This fails on AMD for some reason, so add a fall back to allocating one pool per set
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
-        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
-        vk::DescriptorPool pool = device->device.createDescriptorPool(descriptor_pool_create_info);
-
-        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
-        for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline->dsl;
-        }
-        try {
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        } catch(vk::OutOfPoolMemoryError const&) {
-            device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
-        }
-
-        device->device.destroyDescriptorPool(pool);
-    }
-
-    if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
-        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
-        pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
-    }
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+    pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));

     pipeline->descriptor_set_idx = 0;

@@ -619,7 +701,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
         pipeline->layout);
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;

-
+    {
+        std::lock_guard<std::mutex> guard(device->mutex);
+        device->pipelines.insert({ pipeline->name, pipeline });
+    }
+
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
 }

 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
@@ -640,34 +732,49 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
     device.destroyPipeline(pipeline->pipeline);
 }

-static void
-    VK_LOG_DEBUG("
-
-
-        return;
-    }
+static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
+    VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
+    device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+}

+static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
     std::lock_guard<std::mutex> guard(device->mutex);

-
-
+    for (auto& pair : device->pipeline_descriptor_set_requirements) {
+        vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
+        const uint64_t n = pair.second;

-
-
-
+        VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
+
+        if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
+            // Enough descriptors are available
+            continue;
         }
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
-        } else {
-            for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) {
-                vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
-                vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
-                pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));

-
+        uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
+        uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+        uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+        while (to_alloc > 0) {
+            const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
+            to_alloc -= alloc_count;
+            pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+            if (pool_idx >= pipeline->descriptor_pools.size()) {
+                vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+                vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+                pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
+            }
+
+            std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
+            for (uint32_t i = 0; i < alloc_count; i++) {
+                layouts[i] = pipeline->dsl;
+            }
+            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
             std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline->descriptor_sets.
+            pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
+
+            pool_idx++;
         }
     }
 }
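The rework in the hunks above drops the old MULTI/SINGLE descriptor-pool modes: descriptor-set needs are accumulated per pipeline name and then satisfied from fixed-size pools of `VK_DEVICE_DESCRIPTOR_POOL_SIZE` (32) sets, adding pools on demand. The per-pool split is plain integer arithmetic, illustrated below with hypothetical names and no Vulkan calls.

```cpp
// Standalone sketch of the chunked-pool arithmetic: sets are allocated in pools of
// POOL_SIZE, so a request is split into per-pool chunks starting from the partially
// filled pool. Names are illustrative; the real code allocates vk::DescriptorSet objects.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

constexpr uint32_t POOL_SIZE = 32;

// already_allocated: sets the pipeline already owns; needed: sets it must own in total.
static std::vector<std::pair<uint32_t, uint32_t>> plan_allocations(uint32_t already_allocated, uint32_t needed) {
    std::vector<std::pair<uint32_t, uint32_t>> plan; // (pool index, sets to allocate)
    if (needed <= already_allocated) {
        return plan;                                  // enough descriptors are available
    }

    uint32_t to_alloc       = needed - already_allocated;
    uint32_t pool_remaining = POOL_SIZE - already_allocated % POOL_SIZE;
    uint32_t pool_idx       = already_allocated / POOL_SIZE;

    while (to_alloc > 0) {
        const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
        plan.emplace_back(pool_idx, alloc_count);     // a new pool is created if pool_idx is past the end
        to_alloc      -= alloc_count;
        pool_remaining = POOL_SIZE;
        pool_idx++;
    }
    return plan;
}

int main() {
    // 40 sets already exist (pools 0 and 1), 30 more are needed: 24 from pool 1, 6 from pool 2.
    for (auto & [pool, count] : plan_allocations(40, 70)) {
        std::cout << "pool " << pool << ": allocate " << count << " sets\n";
    }
}
```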
@@ -708,11 +815,14 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
     return s;
 }

-static void ggml_vk_submit(vk_context
-    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
+static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");

     std::vector<std::vector<uint64_t>> tl_wait_vals;
     std::vector<std::vector<uint64_t>> tl_signal_vals;
@@ -828,11 +938,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
     abort();
 }

-static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
+static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
     VK_LOG_DEBUG("ggml_vk_create_queue()");
     std::lock_guard<std::mutex> guard(device->mutex);

     q.queue_family_index = queue_family_index;
+    q.transfer_only = transfer_only;

     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
     q.pool = device->device.createCommandPool(command_pool_create_info_compute);
@@ -844,21 +955,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
     q.stage_flags = stage_flags;
 }

-static vk_context
-
-
-
-    memset((void *) result, 0, sizeof(vk_context));
-    result->idx = ctx->gc.contexts.size() - 1;
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+    vk_context result = std::make_shared<vk_context_struct>();
+    VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
+    ctx->gc.contexts.emplace_back(result);
     result->q = &q;
     return result;
 }

-static vk_context
-
-
-    memset((void *) result, 0, sizeof(vk_context));
-    result->idx = 0;
+static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+    vk_context result = std::make_shared<vk_context_struct>();
+    VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
     result->q = &q;
     return result;
 }
@@ -915,6 +1022,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
+    if (size > device->max_memory_allocation_size) {
+        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+    }
+
     std::lock_guard<std::mutex> guard(device->mutex);

     vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -959,10 +1070,25 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
     try {
         buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
     } catch (const vk::SystemError& e) {
-
-
-
-
+        if (buf->memory_property_flags != fallback_flags) {
+            // Try again with fallback flags
+            memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+            buf->memory_property_flags = fallback_flags;
+
+            try {
+                buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
+            }
+            catch (const vk::SystemError& e) {
+                device->device.destroyBuffer(buf->buffer);
+                buf->size = 0;
+                throw e;
+            }
+        } else {
+            // Out of Host/Device memory, clean up buffer
+            device->device.destroyBuffer(buf->buffer);
+            buf->size = 0;
+            throw e;
+        }
     }
     buf->ptr = nullptr;

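`ggml_vk_create_buffer` above now rejects requests larger than the device's `max_memory_allocation_size` up front and, when allocation with the preferred memory properties fails, retries once with the caller-supplied fallback properties before cleaning up and rethrowing. A generic sketch of that retry shape, using a hypothetical allocator rather than Vulkan:

```cpp
// Generic sketch of the allocate-with-fallback pattern: try the preferred memory
// properties, retry once with fallback properties, otherwise let the error propagate.
// alloc_with_props() is a hypothetical stand-in for the real device allocation.
#include <iostream>
#include <stdexcept>
#include <string>

static void * alloc_with_props(size_t size, const std::string & props) {
    if (props == "device_local") {
        throw std::runtime_error("out of device-local memory"); // simulate failure
    }
    return ::operator new(size);
}

static void * alloc_buffer(size_t size, const std::string & preferred, const std::string & fallback) {
    try {
        return alloc_with_props(size, preferred);
    } catch (const std::runtime_error &) {
        if (preferred != fallback) {
            // Try again with fallback properties (e.g. host-visible instead of device-local)
            return alloc_with_props(size, fallback);
        }
        throw; // nothing left to try; the caller sees the original error
    }
}

int main() {
    void * p = alloc_buffer(1024, "device_local", "host_visible");
    std::cout << "allocated with fallback: " << (p != nullptr) << "\n";
    ::operator delete(p);
}
```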
@@ -998,7 +1124,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
             // Fall back to host memory type
             buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
-
+            // use rebar if available, otherwise fallback to device only visible memory
+            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
@@ -1027,21 +1154,25 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
     return { buf, 0, VK_WHOLE_SIZE };
 }

-static void ggml_vk_sync_buffers(vk_context
+static void ggml_vk_sync_buffers(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");
-
+
+    const bool transfer_queue = ctx->q->transfer_only;

     ctx->s->buffer.pipelineBarrier(
         ctx->q->stage_flags,
         ctx->q->stage_flags,
         {},
-
+        { {
+            { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
+            { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }
+        } },
         {},
         {}
     );
 }

-static void ggml_vk_wait_events(vk_context
+static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
     VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
         return;
@@ -1063,11 +1194,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_s = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };

     std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
-    std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
+    std::initializer_list<uint32_t> warptile_mmq_s = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };

     std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
     std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
@@ -1108,6 +1239,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();

+    std::vector<std::future<void>> compiles;
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+        {
+            // wait until fewer than N compiles are in progress
+            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
+            std::unique_lock<std::mutex> guard(compile_count_mutex);
+            while (compile_count >= N) {
+                compile_count_cond.wait(guard);
+            }
+            compile_count++;
+        }
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
+    };
+
     if (device->fp16) {
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
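Shader compilation in `ggml_vk_load_shaders` is now asynchronous: a local lambda (shadowing the old `ggml_vk_create_pipeline` helper) waits on a condition variable until fewer than `std::thread::hardware_concurrency()` compiles are in flight, then launches `ggml_vk_create_pipeline_func` via `std::async`; the worker decrements `compile_count` and notifies on completion (see the earlier hunk), and the collected futures are waited on at the end of the function (see a later hunk). The same bounded-concurrency pattern in isolation, with hypothetical job names:

```cpp
// Standalone sketch of the compile-throttling pattern: at most N jobs run at once,
// coordinated by a counter, a mutex and a condition variable. The job body is a
// placeholder for the real SPIR-V pipeline compilation.
#include <algorithm>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

static uint32_t job_count = 0;
static std::mutex job_count_mutex;
static std::condition_variable job_count_cond;

static void compile_job(int id) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // pretend to compile
    {
        std::lock_guard<std::mutex> guard(job_count_mutex);
        job_count--;
    }
    job_count_cond.notify_all(); // let a queued launch proceed
    std::cout << "job " << id << " done\n";
}

int main() {
    const uint32_t N = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::future<void>> jobs;
    for (int i = 0; i < 32; ++i) {
        {
            // wait until fewer than N jobs are in progress before launching another
            std::unique_lock<std::mutex> guard(job_count_mutex);
            job_count_cond.wait(guard, [&] { return job_count < N; });
            job_count++;
        }
        jobs.push_back(std::async(std::launch::async, compile_job, i));
    }
    for (auto & j : jobs) {
        j.wait();
    }
}
```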
@@ -1598,6 +1743,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
1598
1743
|
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
|
|
1599
1744
|
|
|
1600
1745
|
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
|
1746
|
+
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
|
1601
1747
|
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
|
1602
1748
|
|
|
1603
1749
|
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
@@ -1605,20 +1751,37 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
1605
1751
|
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1606
1752
|
|
|
1607
1753
|
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1754
|
+
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1608
1755
|
|
|
1609
|
-
ggml_vk_create_pipeline(device, device->
|
|
1756
|
+
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1610
1757
|
|
|
1758
|
+
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1611
1759
|
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1612
1760
|
|
|
1761
|
+
ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1762
|
+
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1763
|
+
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1764
|
+
|
|
1765
|
+
ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
|
|
1766
|
+
|
|
1613
1767
|
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1614
1768
|
|
|
1615
1769
|
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1770
|
+
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1771
|
+
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1616
1772
|
|
|
1617
1773
|
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1618
1774
|
|
|
1775
|
+
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1776
|
+
|
|
1777
|
+
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1778
|
+
|
|
1619
1779
|
ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1780
|
+
ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1620
1781
|
ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1621
1782
|
ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1783
|
+
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1784
|
+
ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
|
1622
1785
|
|
|
1623
1786
|
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
|
|
1624
1787
|
|
|
@@ -1634,6 +1797,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
1634
1797
|
ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
|
1635
1798
|
|
|
1636
1799
|
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1800
|
+
|
|
1801
|
+
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
|
1802
|
+
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
|
1803
|
+
|
|
1804
|
+
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
|
1805
|
+
|
|
1806
|
+
for (auto &c : compiles) {
|
|
1807
|
+
c.wait();
|
|
1808
|
+
}
|
|
1637
1809
|
}
|
|
1638
1810
|
|
|
1639
1811
|
static vk_device ggml_vk_get_device(size_t idx) {
|
|
@@ -1647,6 +1819,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
1647
1819
|
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
1648
1820
|
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
|
|
1649
1821
|
#endif
|
|
1822
|
+
#ifdef GGML_VULKAN_PERF
|
|
1823
|
+
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
|
1824
|
+
#endif
|
|
1650
1825
|
|
|
1651
1826
|
size_t dev_num = vk_instance.device_indices[idx];
|
|
1652
1827
|
|
|
@@ -1777,17 +1952,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
1777
1952
|
device_create_info.setPNext(&device_features2);
|
|
1778
1953
|
device->device = device->physical_device.createDevice(device_create_info);
|
|
1779
1954
|
|
|
1780
|
-
device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
|
|
1781
|
-
|
|
1782
1955
|
// Queues
|
|
1783
|
-
ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
|
|
1956
|
+
ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false);
|
|
1784
1957
|
|
|
1785
1958
|
// Shaders
|
|
1786
1959
|
ggml_vk_load_shaders(device);
|
|
1787
1960
|
|
|
1788
1961
|
if (!device->single_queue) {
|
|
1789
1962
|
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
|
|
1790
|
-
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
|
|
1963
|
+
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
|
1791
1964
|
} else {
|
|
1792
1965
|
// TODO: Use pointer or reference to avoid copy
|
|
1793
1966
|
device->transfer_queue = device->compute_queue;
|
|
@@ -1795,6 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
1795
1968
|
|
|
1796
1969
|
device->buffer_type = {
|
|
1797
1970
|
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
|
1971
|
+
/* .device = */ nullptr,
|
|
1798
1972
|
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
|
1799
1973
|
};
|
|
1800
1974
|
|
|
@@ -2057,9 +2231,9 @@ void ggml_vk_instance_init() {
|
|
|
2057
2231
|
}
|
|
2058
2232
|
|
|
2059
2233
|
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
2060
|
-
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
|
2061
2234
|
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
|
2062
2235
|
ggml_vk_instance_init();
|
|
2236
|
+
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
|
2063
2237
|
|
|
2064
2238
|
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
|
2065
2239
|
|
|
@@ -2074,12 +2248,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
|
2074
2248
|
|
|
2075
2249
|
ctx->fence = ctx->device->device.createFence({});
|
|
2076
2250
|
|
|
2077
|
-
ctx->staging_size = 0;
|
|
2078
|
-
ctx->staging_offset = 0;
|
|
2079
|
-
|
|
2080
|
-
ctx->compute_ctx = nullptr;
|
|
2081
|
-
ctx->transfer_ctx = nullptr;
|
|
2082
|
-
|
|
2083
2251
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
2084
2252
|
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
|
|
2085
2253
|
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
|
|
@@ -2112,7 +2280,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
|
|
2112
2280
|
}
|
|
2113
2281
|
|
|
2114
2282
|
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
|
|
2115
|
-
VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
|
|
2283
|
+
VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
|
|
2116
2284
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
|
2117
2285
|
return ctx->device->pipeline_matmul_f32;
|
|
2118
2286
|
}
|
|
@@ -2126,7 +2294,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
|
2126
2294
|
return ctx->device->pipeline_matmul_f16;
|
|
2127
2295
|
}
|
|
2128
2296
|
|
|
2129
|
-
|
|
2297
|
+
if (src1_type != GGML_TYPE_F32) {
|
|
2298
|
+
return nullptr;
|
|
2299
|
+
}
|
|
2130
2300
|
|
|
2131
2301
|
switch (src0_type) {
|
|
2132
2302
|
case GGML_TYPE_Q4_0:
|
|
@@ -2370,28 +2540,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
     return s;
 }
 
-
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
-    for (auto& buffer :
-    std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.
+    for (auto& buffer : descriptor_buffer_infos) {
+        std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
     }
     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
-    GGML_ASSERT(
-    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
-    }
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
+    GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
 
-
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
 
     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
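The hunk above replaces the old per-binding loop (one vk::WriteDescriptorSet per storage buffer) with a single write whose descriptorCount spans all of the pipeline's bindings. A minimal Vulkan-Hpp sketch of the same pattern is below; the function and variable names are illustrative, not taken from ggml.

```cpp
#include <array>
#include <vulkan/vulkan.hpp>

// Bind N storage buffers to consecutive bindings [0, N) of one descriptor set
// with a single vk::WriteDescriptorSet, instead of one write per binding.
void bind_storage_buffers(vk::Device device, vk::DescriptorSet set,
                          const std::array<vk::DescriptorBufferInfo, 3> & infos) {
    vk::WriteDescriptorSet write{
        set,
        0,                                        // dstBinding: start at binding 0
        0,                                        // dstArrayElement
        static_cast<uint32_t>(infos.size()),      // descriptorCount spans bindings 0..N-1
        vk::DescriptorType::eStorageBuffer,
        nullptr,                                  // no image info
        infos.data()                              // contiguous pBufferInfo array
    };
    device.updateDescriptorSets({ write }, {});
}
```

Note that a single write can only span consecutive bindings of the same descriptor type, which holds here because every parameter is a storage buffer.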
@@ -2410,7 +2575,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
     s.signal_semaphores = std::move(signal_semaphores);
 }
 
-static void ggml_vk_ctx_end(vk_context
+static void ggml_vk_ctx_end(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
@@ -2420,7 +2585,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
     ctx->s = nullptr;
 }
 
-static void ggml_vk_ctx_begin(vk_device& device, vk_context
+static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
     VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
@@ -2453,7 +2618,7 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
     }
 }
 
-static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
     GGML_ASSERT(!ggml_is_contiguous(tensor));
     // Buffer is already mapped
@@ -2515,23 +2680,15 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         return;
     }
 
-
-
-    size_t staging_offset = ctx->staging_offset;
-    const size_t copy_size = ts*ne/bs;
-    if (ctx->staging->size < ctx->staging_offset + copy_size) {
-        if (sync_staging) {
-            // Create temporary larger buffer
-            ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
-
-            staging = ctx->device->sync_staging;
-            staging_offset = 0;
-        } else {
-            GGML_ABORT("fatal error");
-        }
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous write to non-pinned memory not supported");
     }
 
-
+    // Staging buffer required
+    vk_buffer& staging = ctx->device->sync_staging;
+    const uint64_t copy_size = ts*ne/bs;
+    ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
+    VkBufferCopy buf_copy{ 0, offset, copy_size };
 
     ggml_vk_sync_buffers(subctx);
     vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
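The rewrite above drops the per-context staging buffer and offset bookkeeping: a write to non-pinned memory is now only allowed synchronously, through the device's single reusable sync_staging buffer, which is grown on demand. A small standalone sketch of that control flow, using stand-in types rather than the real Vulkan/ggml ones:

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

struct StagingBuffer { std::vector<uint8_t> host; };   // host-visible staging memory
struct DeviceBuffer  { std::vector<uint8_t> data; };   // stands in for device-local memory

static StagingBuffer g_sync_staging;                   // grown on demand, like device->sync_staging

void ensure_sync_staging(size_t size) {
    if (g_sync_staging.host.size() < size) g_sync_staging.host.resize(size);
}

void buffer_write_sync(DeviceBuffer& dst, size_t offset, const void* src, size_t size, bool sync_staging) {
    if (!sync_staging) {
        // Mirrors the new behaviour: async writes to non-pinned memory are rejected outright.
        throw std::runtime_error("Asynchronous write to non-pinned memory not supported");
    }
    ensure_sync_staging(size);
    std::memcpy(g_sync_staging.host.data(), src, size);                            // host -> staging
    std::memcpy(dst.data.data() + offset, g_sync_staging.host.data(), size);       // staging -> device (stand-in for vkCmdCopyBuffer + fence wait)
}
```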
@@ -2540,14 +2697,14 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         for (uint64_t i2 = 0; i2 < ne2; i2++) {
             // Find longest contiguous slice
             if (ne1*nb1 == dstnb2) {
-                deferred_memcpy((uint8_t *)staging->ptr +
+                deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
             } else {
                 for (uint64_t i1 = 0; i1 < ne1; i1++) {
                     if (ne0*nb0/bs == dstnb1) {
-                        deferred_memcpy((uint8_t *)staging->ptr +
+                        deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
                     } else {
                         const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
-                        const uint64_t d_off =
+                        const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
                         for (uint64_t i0 = 0; i0 < ne0; i0++) {
                             deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
                         }
@@ -2558,7 +2715,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     }
 }
 
-static void ggml_vk_buffer_write_2d_async(vk_context
+static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2593,21 +2750,18 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
     }
     VK_LOG_DEBUG("STAGING");
 
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous write to non-pinned memory not supported");
+    }
+
     // Staging buffer required
     const size_t copy_size = width*height;
-
-        if (sync_staging) {
-            ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
+    ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
 
-
-            staging_offset = 0;
-        } else {
-            GGML_ABORT("fatal error");
-        }
-    }
+    vk_buffer& staging_buffer = dst->device->sync_staging;
 
     VkBufferCopy buf_copy = {
-
+        0,
         offset,
         copy_size};
 
@@ -2615,17 +2769,17 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
     vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy);
 
     if (width == spitch) {
-        deferred_memcpy((uint8_t *)staging_buffer->ptr
+        deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
     } else {
         for (size_t i = 0; i < height; i++) {
-            deferred_memcpy((uint8_t *)staging_buffer->ptr +
+            deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
         }
     }
 }
 
-static void ggml_vk_buffer_write_async(vk_context
+static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
-    return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1,
+    return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
 }
 
 static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
@@ -2638,9 +2792,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
             memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
         }
     } else {
-        vk_context
+        vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
         ggml_vk_ctx_begin(dst->device, subctx);
-        ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height,
+        ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
         ggml_vk_ctx_end(subctx);
 
         for (auto& cpy : subctx->in_memcpys) {
@@ -2650,8 +2804,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
         ggml_vk_submit(subctx, dst->device->fence);
         VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         dst->device->device.resetFences({ dst->device->fence });
-
-        delete subctx;
     }
 }
 
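The removed `delete subctx;` lines, together with `ggml_vk_create_temporary_context` now returning a plain `vk_context` value, suggest the context handle owns its own lifetime. A sketch of that ownership style follows, assuming a reference-counted handle; the shared_ptr alias here is an assumption for illustration, not the confirmed ggml definition.

```cpp
#include <memory>
#include <vector>

struct vk_context_struct {
    std::vector<int> seqs;   // placeholder for recorded command sequences
};
using vk_context = std::shared_ptr<vk_context_struct>;   // assumed handle type

vk_context create_temporary_context(/* vk_queue& q */) {
    return std::make_shared<vk_context_struct>();
}

void buffer_memset_example() {
    vk_context subctx = create_temporary_context();
    // ... record commands, submit, wait on a fence ...
}   // no explicit delete: the handle releases the context when it goes out of scope
```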
@@ -2660,12 +2812,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src
     ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
 }
 
-static void ggml_vk_buffer_read_2d_async(vk_context
+static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
     GGML_ASSERT(width > 0);
     GGML_ASSERT(height > 0);
     GGML_ASSERT(src != nullptr);
 
+    // TODO: staging_offset is not used
+
     // Check if dst is pinned memory
     vk_buffer buf = nullptr;
     size_t buf_offset;
@@ -2695,18 +2849,15 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
     }
     VK_LOG_DEBUG("STAGING");
 
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous read from non-pinned memory not supported");
+    }
+
     // Fall back to staging buffer
     const size_t copy_size = dpitch * height;
-
-        if (sync_staging) {
-            // Create temporary larger buffer
-            ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
+    ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
 
-
-        } else {
-            GGML_ABORT("fatal error");
-        }
-    }
+    vk_buffer& staging_buffer = src->device->sync_staging;
 
     ggml_vk_sync_buffers(subctx);
     subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
@@ -2714,20 +2865,24 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
     deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
 }
 
-static void ggml_vk_buffer_read_async(vk_context
-    return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1,
+static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
+    return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
 }
 
 static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
-    VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
-
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
+
+    // If the device is not an UMA device the memory is host-accessible through rebar. While writing
+    // through PCIe is sufficient fast reading back data from PCIe is slower than going through
+    // the HW device to host copy path.
+    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
         GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
         memcpy(dst, (uint8_t *) src->ptr + offset, size);
     } else {
-        vk_context
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
         ggml_vk_ctx_begin(src->device, subctx);
-        ggml_vk_buffer_read_async(subctx, src, offset, dst, size,
+        ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
         ggml_vk_ctx_end(subctx);
 
         ggml_vk_submit(subctx, src->device->fence);
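The new branch in ggml_vk_buffer_read only takes the direct memcpy path when the buffer is host-visible and the device is UMA; on discrete GPUs, reading mapped ReBAR memory back over PCIe is slow, so the device-to-host copy path is used instead. A schematic of that decision with stand-in types (not the real ggml/Vulkan ones):

```cpp
#include <cstddef>
#include <cstring>

struct Buffer {
    bool        host_visible;
    bool        device_is_uma;
    const char* mapped_ptr;     // valid when host_visible
};

// Stand-in for the staging-buffer + fence read path used for discrete GPUs.
void device_copy_read(const Buffer& /*src*/, size_t /*offset*/, void* /*dst*/, size_t /*size*/) {
    // record a copy into a host-visible staging buffer, submit, wait on a fence, memcpy out
}

void buffer_read(const Buffer& src, size_t offset, void* dst, size_t size) {
    if (src.host_visible && src.device_is_uma) {
        std::memcpy(dst, src.mapped_ptr + offset, size);   // cheap on UMA devices
    } else {
        device_copy_read(src, offset, dst, size);          // avoid slow reads over the BAR
    }
}
```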
@@ -2737,12 +2892,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
-
-        delete subctx;
     }
 }
 
-static void ggml_vk_buffer_copy_async(vk_context
+static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
     // Make sure both buffers are on same device
     GGML_ASSERT(src->device == dst->device);
@@ -2756,15 +2909,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
     if (src->device == dst->device) {
         VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
         // Copy within the device
-        vk_context
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
         ggml_vk_ctx_begin(src->device, subctx);
         ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
         ggml_vk_ctx_end(subctx);
         ggml_vk_submit(subctx, src->device->fence);
         VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
         src->device->device.resetFences({ src->device->fence });
-
-        delete subctx;
     } else {
         VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
         // Copy device to device
@@ -2783,7 +2934,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
 
-    vk_context
+    vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
     ggml_vk_ctx_begin(dst->device, subctx);
     subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
     ggml_vk_ctx_end(subctx);
@@ -2791,8 +2942,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
     ggml_vk_submit(subctx, dst->device->fence);
     VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
     dst->device->device.resetFences({ dst->device->fence });
-
-    delete subctx;
 }
 
 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
@@ -2855,7 +3004,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
 }
 
 static void ggml_vk_matmul(
-        ggml_backend_vk_context * ctx, vk_context
+        ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
         vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2879,7 +3028,7 @@ static void ggml_vk_matmul(
 }
 
 static void ggml_vk_matmul_id(
-        ggml_backend_vk_context * ctx, vk_context
+        ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
         vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2916,7 +3065,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
     GGML_ABORT("fatal error");
 }
 
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
     VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
     const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2934,10 +3083,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
 }
 
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -2957,9 +3107,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3011,8 +3161,58 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
     const uint64_t d_sz = sizeof(float) * d_ne;
 
-
-
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0;
+        if (
+            (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+            (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
+            (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+        if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
+            ctx->prealloc_size_split_k = split_k_size;
+        }
+
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        if (split_k > 1) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
+        }
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
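The large block added above is part of a two-pass scheme: when `dryrun` is true the matmul only records how much dequantization scratch it will need and requests descriptor sets, then returns before touching any buffers; the real pass later runs against storage sized to the recorded maxima. A compact sketch of the pattern with hypothetical names, not the ggml API:

```cpp
#include <algorithm>
#include <cstdint>
#include <stdexcept>

struct Context {
    uint64_t max_alloc              = 1ull << 32;
    uint64_t prealloc_size_x        = 0;   // dequantized src0 scratch
    uint64_t prealloc_size_y        = 0;   // dequantized src1 scratch
    uint32_t descriptor_sets_needed = 0;
};

void mul_mat(Context& ctx, uint64_t x_bytes, uint64_t y_bytes, bool dryrun) {
    if (dryrun) {
        if (x_bytes > ctx.max_alloc || y_bytes > ctx.max_alloc) {
            throw std::runtime_error("Requested preallocation size is too large");
        }
        ctx.prealloc_size_x = std::max(ctx.prealloc_size_x, x_bytes);
        ctx.prealloc_size_y = std::max(ctx.prealloc_size_y, y_bytes);
        ctx.descriptor_sets_needed += 1;   // request, do not allocate yet
        return;                            // no buffers are touched on the dryrun pass
    }
    // real pass: scratch buffers of prealloc_size_x / prealloc_size_y and the
    // requested descriptor sets are assumed to have been allocated by now
}
```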
@@ -3020,13 +3220,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx =
-        qx_buf_offset =
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3046,40 +3246,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         GGML_ASSERT(qy_sz == y_sz);
     }
 
-    vk_pipeline to_fp16_vk_0 = nullptr;
-    vk_pipeline to_fp16_vk_1 = nullptr;
-
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
-
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
-    }
-    if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
-    }
-
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3107,10 +3279,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     ); // NOLINT
 }
 
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -3134,9 +3307,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3174,21 +3347,62 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
     const uint64_t d_sz = sizeof(float) * d_ne;
 
-
-
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
+    GGML_ASSERT(dmmv != nullptr);
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+            (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+            (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx =
-        qx_buf_offset =
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3206,30 +3420,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         GGML_ASSERT(qy_sz == y_sz);
     }
 
-    vk_pipeline to_fp16_vk_0 = nullptr;
-    vk_pipeline to_fp16_vk_1 = nullptr;
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type);
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
-    GGML_ASSERT(dmmv != nullptr);
-
-    // Allocate descriptor sets
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
-    }
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13);
-
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -3268,14 +3458,15 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
         sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }
 
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3294,9 +3485,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     GGML_ASSERT(ne11 == 1);
 
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy;
     size_t qy_buf_offset = 0;
@@ -3316,21 +3507,24 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t d_sz = sizeof(float) * d_ne;
 
-
-
+    if (dryrun) {
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx =
-    const uint64_t qx_buf_offset =
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
-
     const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
 
@@ -3340,13 +3534,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }
 
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3368,9 +3563,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
     GGML_ASSERT(ne11 == 1);
 
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
@@ -3391,21 +3586,24 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t qy_sz = ggml_nbytes(src1);
     const uint64_t d_sz = sizeof(float) * d_ne;
 
-
-
-
-
-
+    if (dryrun) {
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
-
     const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
 
@@ -3415,23 +3613,24 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }
 
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
     if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
-        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
+        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
     } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
-        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst);
+        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
     } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
-        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst);
+        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
     } else {
-        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst);
+        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
     }
 }
 
-static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3463,10 +3662,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 
     const uint64_t n_as = ne02;
 
-
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3521,26 +3720,68 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t ids_sz = nbi2;
     const uint64_t d_sz = sizeof(float) * d_ne;
 
-
-
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+            (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+            (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx =
-        qx_buf_offset =
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
-        d_ids =
-        ids_buf_offset =
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3560,37 +3801,13 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
         GGML_ASSERT(qy_sz == y_sz);
     }
 
-    vk_pipeline to_fp16_vk_0 = nullptr;
-    vk_pipeline to_fp16_vk_1 = nullptr;
-
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
-
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
-    }
-
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3618,11 +3835,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     ); // NOLINT
 }

-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3649,10 +3867,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t ne22 = dst->ne[2];
     const uint64_t ne23 = dst->ne[3];

-
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;

     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3696,26 +3914,67 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t ids_sz = nbi2;
     const uint64_t d_sz = sizeof(float) * d_ne;

-
-
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
+    GGML_ASSERT(dmmv != nullptr);
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx =
-        qx_buf_offset =
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy =
-        qy_buf_offset =
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
-        d_ids =
-        ids_buf_offset =
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3733,30 +3992,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         GGML_ASSERT(qy_sz == y_sz);
     }

-    vk_pipeline to_fp16_vk_0 = nullptr;
-    vk_pipeline to_fp16_vk_1 = nullptr;
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
-    GGML_ASSERT(dmmv != nullptr);
-
-    // Allocate descriptor sets
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
-    }
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13);
-
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -3790,95 +4025,22 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+        vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
 }

-static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
-        ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
+        ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
     } else {
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
+        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
     }
 }

-static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-    const uint64_t ne2 = dst->ne[2];
-    const uint64_t ne3 = dst->ne[3];
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t nb0 = dst->nb[0];
-    const uint64_t nb1 = dst->nb[1];
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
-
-    const uint64_t nb00 = src0->nb[0];
-    const uint64_t nb01 = src0->nb[1];
-    const uint64_t nb02 = src0->nb[2];
-    const uint64_t nb03 = src0->nb[3];
-
-    const uint64_t nr0 = ne0/ne00;
-    const uint64_t nr1 = ne1/ne01;
-    const uint64_t nr2 = ne2/ne02;
-    const uint64_t nr3 = ne3/ne03;
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-
-    const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
-    vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset + dst->view_offs;
-
-    std::vector<vk::BufferCopy> copies;
-
-    for (uint64_t i3 = 0; i3 < nr3; i3++) {
-        for (uint64_t k3 = 0; k3 < ne03; k3++) {
-            for (uint64_t i2 = 0; i2 < nr2; i2++) {
-                for (uint64_t k2 = 0; k2 < ne02; k2++) {
-                    for (uint64_t i1 = 0; i1 < nr1; i1++) {
-                        for (uint64_t k1 = 0; k1 < ne01; k1++) {
-                            for (uint64_t i0 = 0; i0 < nr0; i0++) {
-                                copies.push_back({
-                                    src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
-                                    dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
-                                    ne00*nb0,
-                                });
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    ggml_vk_sync_buffers(subctx);
-    subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(src1);
-}
-
-
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
     switch (op) {
-    case GGML_OP_ADD:
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_add_f32;
-        }
-        return nullptr;
     case GGML_OP_GET_ROWS:
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
         if (dst->type == GGML_TYPE_F16) {
@@ -3888,6 +4050,19 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_get_rows_f32[src0->type];
         }
         return nullptr;
+    case GGML_OP_ACC:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_acc_f32;
+        }
+        return nullptr;
+    case GGML_OP_ADD:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_add_f32;
+        }
+        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_add_f16_f32_f16;
+        }
+        return nullptr;
     case GGML_OP_MUL:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_mul_f32;
@@ -3898,6 +4073,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_div_f32;
         }
         return nullptr;
+    case GGML_OP_CONCAT:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_concat_f32;
+        }
+        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_concat_f16;
+        }
+        if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+            return ctx->device->pipeline_concat_i32;
+        }
+        return nullptr;
+    case GGML_OP_UPSCALE:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_upscale_f32;
+        }
+        return nullptr;
     case GGML_OP_SCALE:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_scale_f32;
@@ -3908,11 +4099,31 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_sqr_f32;
         }
         return nullptr;
+    case GGML_OP_SIN:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_sin_f32;
+        }
+        return nullptr;
+    case GGML_OP_COS:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_cos_f32;
+        }
+        return nullptr;
     case GGML_OP_CLAMP:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
        }
        return nullptr;
+    case GGML_OP_PAD:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_pad_f32;
+        }
+        return nullptr;
+    case GGML_OP_REPEAT:
+        if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
+            return ctx->device->pipeline_repeat_f32;
+        }
+        return nullptr;
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
@@ -3922,6 +4133,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_norm_f32;
         }
         return nullptr;
+    case GGML_OP_GROUP_NORM:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_group_norm_f32;
+        }
+        return nullptr;
     case GGML_OP_RMS_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_rms_norm_f32;
@@ -3939,11 +4155,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
                 return ctx->device->pipeline_gelu_f32;
             }
             break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                return ctx->device->pipeline_gelu_quick_f32;
+            }
+            break;
         case GGML_UNARY_OP_RELU:
             if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                 return ctx->device->pipeline_relu_f32;
             }
             break;
+        case GGML_UNARY_OP_TANH:
+            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                return ctx->device->pipeline_tanh_f32;
+            }
+            break;
         default:
             break;
         }
@@ -3966,7 +4192,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_ROPE:
         {
             const int mode = ((const int32_t *) dst->op_params)[2];
-            const bool is_neox = mode &
+            const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

             if (is_neox) {
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
@@ -3995,6 +4221,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_sum_rows_f32;
         }
         return nullptr;
+    case GGML_OP_IM2COL:
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_im2col_f32;
+        }
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_im2col_f32_f16;
+        }
+        return nullptr;
+    case GGML_OP_TIMESTEP_EMBEDDING:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_timestep_embedding_f32;
+        }
+        return nullptr;
+    case GGML_OP_LEAKY_RELU:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_leaky_relu_f32;
+        }
+        return nullptr;
     default:
         return nullptr;
     }
@@ -4002,15 +4246,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     GGML_UNUSED(src2);
 }

-static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
-    switch(op) {
-    case GGML_OP_REPEAT:
-        return ggml_vk_op_repeat;
-    default:
-        return nullptr;
-    }
-}
-
 static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     switch (op) {
     case GGML_OP_CPY:
@@ -4018,9 +4253,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_ADD:
     case GGML_OP_MUL:
     case GGML_OP_DIV:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
     case GGML_OP_SCALE:
     case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
     case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
         return true;
     default:
         return false;
@@ -4028,7 +4269,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 }

 template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4036,10 +4277,11 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     if (src2 != nullptr) {
         std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
-    GGML_ASSERT(dst->
+    GGML_ASSERT(dst->buffer != nullptr);
     const uint64_t ne00 = src0->ne[0];
     const uint64_t ne01 = src0->ne[1];
     const uint64_t ne02 = src0->ne[2];
@@ -4068,29 +4310,27 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ned = ned0 * ned1;

     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
-    ggml_vk_func_t op_func;

     if (pipeline == nullptr) {
-
-        if (
-            std::cerr << "
-            if (src1 != nullptr) {
-                std::cerr << " and " << ggml_type_name(src1->type);
-            }
-            std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
-            GGML_ABORT("fatal error");
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
+        if (src1 != nullptr) {
+            std::cerr << " and " << ggml_type_name(src1->type);
         }
+        std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
+        GGML_ABORT("fatal error");
+    }

-
+    if (dryrun) {
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
         return;
     }

     const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);

-
-
-
-
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
+    ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;

     vk_buffer d_X = nullptr;
     size_t x_buf_offset = 0;
@@ -4121,29 +4361,29 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ned;

-    vk_buffer d_D =
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;

     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && y_sz > d_D->size) {
+    if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }

     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = ((
-    GGML_ASSERT(d_buf_offset ==
+    uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
-        d_X =
-        x_buf_offset =
+        d_X = src0_buf_ctx->dev_buffer;
+        x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
-        d_Y =
-        y_buf_offset =
+        d_Y = src1_buf_ctx->dev_buffer;
+        y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
-        d_Z =
-        z_buf_offset =
+        d_Z = src2_buf_ctx->dev_buffer;
+        z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }

@@ -4170,127 +4410,143 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     std::array<uint32_t, 3> elements;

     // Single call if dimension 2 is contiguous
-
-    ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
+    GGML_ASSERT(op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1))));

-
-
-
-
-
-
-
-
-
-
-
-    case GGML_OP_GET_ROWS:
-        elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
-        break;
-    case GGML_OP_ARGSORT:
-        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
-        break;
-    default:
-        elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
-        break;
-    }
-
-    if (!op_supports_incontiguous) {
-        if (x_sz != VK_WHOLE_SIZE) {
-            x_sz *= ne02 * ne03;
-        }
-        if (use_src1 && y_sz != VK_WHOLE_SIZE) {
-            y_sz *= ne12 * ne13;
-        }
-        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
-            z_sz *= ne22 * ne23;
-        }
-        if (d_sz != VK_WHOLE_SIZE) {
-            d_sz *= ned2 * ned3;
-        }
-    }
-
-    if (op == GGML_OP_SOFT_MAX) {
-        // Empty src1 is possible in soft_max, but the shader needs a buffer
-        vk_subbuffer subbuf_y;
-        if (use_src1) {
-            subbuf_y = { d_Y, y_buf_offset, y_sz };
+    switch (op) {
+    case GGML_OP_NORM:
+    case GGML_OP_RMS_NORM:
+    case GGML_OP_SOFT_MAX:
+    case GGML_OP_SUM_ROWS:
+        {
+            const uint32_t nr = ggml_nrows(src0);
+            if (nr > 262144) {
+                elements = { 512, 512, CEIL_DIV(nr, 262144) };
+            } else if (nr > 512) {
+                elements = { 512, CEIL_DIV(nr, 512), 1 };
            } else {
-
+                elements = { nr, 1, 1 };
            }
+        } break;
+    case GGML_OP_GROUP_NORM:
+        {
+            const uint32_t num_groups = dst->op_params[0];
+            elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
+        } break;
+    case GGML_OP_DIAG_MASK_INF:
+    case GGML_OP_ROPE:
+        elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
+        break;
+    case GGML_OP_GET_ROWS:
+        elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+        break;
+    case GGML_OP_ARGSORT:
+        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+        break;
+    case GGML_OP_IM2COL:
+        {
+            const bool is_2D = dst->op_params[6] == 1;

-
-
-
-
-
-
-
+            const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+
+            const uint32_t KH = is_2D ? src0->ne[1] : 1;
+            const uint32_t KW = src0->ne[0];
+
+            const uint32_t OH = is_2D ? dst->ne[2] : 1;
+            const uint32_t OW = dst->ne[1];
+
+            const uint32_t batch = src1->ne[3];
+
+            elements = { OW * KW * KH, OH, batch * IC };
+        } break;
+    case GGML_OP_TIMESTEP_EMBEDDING:
+        {
+            const uint32_t dim = dst->op_params[0];
+            uint32_t half_ceil = (dim + 1) / 2;
+            elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
+        } break;
+    case GGML_OP_ADD:
+    case GGML_OP_DIV:
+    case GGML_OP_MUL:
+    case GGML_OP_SCALE:
+    case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
+    case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
+    case GGML_OP_CPY:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
+    case GGML_OP_UNARY:
+        {
+            const uint32_t ne = ggml_nelements(dst);
+            if (ne > 262144) {
+                elements = { 512, 512, CEIL_DIV(ne, 262144) };
+            } else if (ne > 512) {
+                elements = { 512, CEIL_DIV(ne, 512), 1 };
            } else {
-
+                elements = { ne, 1, 1 };
            }
+        } break;
+    default:
+        elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
+        break;
+    }

-
-
-
-    ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-    } else if (use_src1) {
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-    } else {
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    if (!op_supports_incontiguous) {
+        if (x_sz != VK_WHOLE_SIZE) {
+            x_sz *= ne02 * ne03;
        }
-
-
-
-
-
-
-
-
-    case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
-        elements = { (uint32_t)ne01, 1, 1 };
-        break;
-    case GGML_OP_DIAG_MASK_INF:
-    case GGML_OP_ROPE:
-        elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
-        break;
-    case GGML_OP_GET_ROWS:
-        elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
-        break;
-    default:
-        elements = { (uint32_t)ne0, 1, 1 };
-        break;
+        if (use_src1 && y_sz != VK_WHOLE_SIZE) {
+            y_sz *= ne12 * ne13;
+        }
+        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+            z_sz *= ne22 * ne23;
+        }
+        if (d_sz != VK_WHOLE_SIZE) {
+            d_sz *= ned2 * ned3;
        }
+    }

-
-
-
-
-
-
-
+    if (op == GGML_OP_SOFT_MAX) {
+        // Empty src1 is possible in soft_max, but the shader needs a buffer
+        vk_subbuffer subbuf_y;
+        if (use_src1) {
+            subbuf_y = { d_Y, y_buf_offset, y_sz };
+        } else {
+            subbuf_y = { d_X, 0, x_sz };
+        }

-
-
-
-
-
-
-
-
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (op == GGML_OP_ROPE) {
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
+        } else {
+            subbuf_z = { d_X, 0, x_sz };
        }
-    }
-    }

-
-
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (op == GGML_OP_IM2COL) {
+        // im2col uses only src1 and dst buffers
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (use_src2) {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (use_src1) {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    }
 }

-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4301,11 +4557,32 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
-        0.0f, 0.0f,
-    });
+        0.0f, 0.0f, 0,
+    }, dryrun);
 }

-static void
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
+        d_offset,
+        0.0f, 0.0f, offset,
+    }, dryrun);
+}
+
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4316,11 +4593,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
-        0.0f, 0.0f,
-    });
+        0.0f, 0.0f, 0,
+    }, dryrun);
 }

-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4331,11 +4608,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
-        0.0f, 0.0f,
-    });
+        0.0f, 0.0f, 0,
+    }, dryrun);
 }

-static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4346,11 +4623,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
-        0.0f, 0.0f,
-    });
+        0.0f, 0.0f, 0,
+    }, dryrun);
 }

-static void
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    int * op_params = (int *)dst->op_params;
+
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f, op_params[0],
+    }, dryrun);
+}
+
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+
+    const float sf0 = (float)dst->ne[0] / src0->ne[0];
+    const float sf1 = (float)dst->ne[1] / src0->ne[1];
+    const float sf2 = (float)dst->ne[2] / src0->ne[2];
+    const float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+    ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
+        (uint32_t)ggml_nelements(dst), 0,
+        (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
+        sf0, sf1, sf2, sf3,
+    }, dryrun);
+}
+
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4361,10 +4671,10 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         op_params[0], 0.0f
-    });
+    }, dryrun);
 }

-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4374,10 +4684,36 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
+}
+
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    }, dryrun);
+}
+
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    }, dryrun);
 }

-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4387,15 +4723,40 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
-        op_params[0], op_params[1],
-    });
+        op_params[0], op_params[1],
+    }, dryrun);
+}
+
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    }, dryrun);
+}
+
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    }, dryrun);
 }

-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;

     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4403,30 +4764,41 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         d_offset,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }

-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;

-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+}
+
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const int * int_op_params = (const int *)dst->op_params;
+    const float * float_op_params = (const float *)dst->op_params;
+
+    const uint32_t num_groups = int_op_params[0];
+    const float eps = float_op_params[1];
+    const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
 }

-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
 }

-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }

-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     int32_t * op_params = (int32_t *)dst->op_params;
-    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
+    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
 }

-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;

     float scale = op_params[0];
@@ -4448,10 +4820,10 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
4448
4820
|
scale, max_bias,
|
|
4449
4821
|
m0, m1,
|
|
4450
4822
|
n_head_log2,
|
|
4451
|
-
});
|
|
4823
|
+
}, dryrun);
|
|
4452
4824
|
}
|
|
4453
4825
|
|
|
4454
|
-
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context
|
|
4826
|
+
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
|
4455
4827
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
4456
4828
|
// const int mode = ((int32_t *) dst->op_params)[2];
|
|
4457
4829
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
@@ -4472,10 +4844,10 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
|
4472
4844
|
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
|
4473
4845
|
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
|
|
4474
4846
|
src2 != nullptr,
|
|
4475
|
-
});
|
|
4847
|
+
}, dryrun);
|
|
4476
4848
|
}
|
|
4477
4849
|
|
|
4478
|
-
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context
|
|
4850
|
+
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
4479
4851
|
int32_t * op_params = (int32_t *)dst->op_params;
|
|
4480
4852
|
|
|
4481
4853
|
uint32_t ncols = src0->ne[0];
|
|
@@ -4491,11 +4863,60 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
4491
4863
|
ncols,
|
|
4492
4864
|
ncols_pad,
|
|
4493
4865
|
op_params[0],
|
|
4494
|
-
});
|
|
4866
|
+
}, dryrun);
|
|
4867
|
+
}
|
|
4868
|
+
|
|
4869
|
+
static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
4870
|
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
|
|
4871
|
+
}
|
|
4872
|
+
|
|
4873
|
+
static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
4874
|
+
const int32_t s0 = dst->op_params[0];
|
|
4875
|
+
const int32_t s1 = dst->op_params[1];
|
|
4876
|
+
const int32_t p0 = dst->op_params[2];
|
|
4877
|
+
const int32_t p1 = dst->op_params[3];
|
|
4878
|
+
const int32_t d0 = dst->op_params[4];
|
|
4879
|
+
const int32_t d1 = dst->op_params[5];
|
|
4880
|
+
|
|
4881
|
+
const bool is_2D = dst->op_params[6] == 1;
|
|
4882
|
+
|
|
4883
|
+
const uint32_t IC = src1->ne[is_2D ? 2 : 1];
|
|
4884
|
+
const uint32_t IH = is_2D ? src1->ne[1] : 1;
|
|
4885
|
+
const uint32_t IW = src1->ne[0];
|
|
4886
|
+
|
|
4887
|
+
const uint32_t KH = is_2D ? src0->ne[1] : 1;
|
|
4888
|
+
const uint32_t KW = src0->ne[0];
|
|
4889
|
+
|
|
4890
|
+
const uint32_t OH = is_2D ? dst->ne[2] : 1;
|
|
4891
|
+
const uint32_t OW = dst->ne[1];
|
|
4892
|
+
|
|
4893
|
+
const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
|
4894
|
+
const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
|
|
4895
|
+
|
|
4896
|
+
const uint32_t pelements = OW * KW * KH;
|
|
4897
|
+
|
|
4898
|
+
ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
|
|
4899
|
+
batch_offset, offset_delta,
|
|
4900
|
+
IC, IW, IH, OW, OH, KW, KH,
|
|
4901
|
+
pelements,
|
|
4902
|
+
IC * KH * KW,
|
|
4903
|
+
s0, s1, p0, p1, d0, d1,
|
|
4904
|
+
}, dryrun);
|
|
4905
|
+
}
|
|
4906
|
+
|
|
4907
|
+
static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
4908
|
+
const uint32_t dim = dst->op_params[0];
|
|
4909
|
+
const uint32_t max_period = dst->op_params[1];
|
|
4910
|
+
const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
|
|
4911
|
+
|
|
4912
|
+
ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
|
|
4913
|
+
nb1, dim, max_period,
|
|
4914
|
+
}, dryrun);
|
|
4495
4915
|
}
|
|
4496
4916
|
|
|
4497
|
-
static void
|
|
4498
|
-
|
|
4917
|
+
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
4918
|
+
const float * op_params = (const float *)dst->op_params;
|
|
4919
|
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
|
4499
4920
|
}
|
|
4500
4921
|
|
|
4501
4922
|
#ifdef GGML_VULKAN_RUN_TESTS
|
|
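Note: the hunks above thread a new `dryrun` parameter through the per-op helpers (ggml_vk_cpy, ggml_vk_norm, ggml_vk_rope, ...) into ggml_vk_op_f32. As far as the diff shows, this enables a two-pass scheme: a first dry-run pass walks the graph only to tally which pipelines will be dispatched (so descriptor sets can be sized up front), and a second pass records the actual commands. The sketch below models that pattern in plain standard C++; the types and function names are hypothetical stand-ins for illustration, not the ggml Vulkan API.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-ins, not the real ggml Vulkan types.
struct Pipeline { std::string name; };

struct Device {
    // Descriptor sets each pipeline asked for during the dry run.
    std::map<std::string, int> descriptor_set_requests;
    int allocated_sets = 0;
};

// In a dry run only count the dispatch; otherwise record the real work.
void dispatch_op(Device & dev, const Pipeline & p, bool dryrun) {
    if (dryrun) {
        dev.descriptor_set_requests[p.name]++;   // pass 1: tally requirements
        return;
    }
    std::printf("recording dispatch for %s\n", p.name.c_str()); // pass 2
}

void allocate_descriptor_sets(Device & dev) {
    for (const auto & req : dev.descriptor_set_requests) {
        dev.allocated_sets += req.second;        // allocate everything in one place
    }
    std::printf("allocated %d descriptor sets\n", dev.allocated_sets);
}

int main() {
    Device dev;
    std::vector<Pipeline> graph = { {"cpy"}, {"rms_norm"}, {"soft_max"}, {"rope"} };

    for (const auto & p : graph) dispatch_op(dev, p, /*dryrun=*/true);  // pass 1
    allocate_descriptor_sets(dev);                                      // allocate once
    for (const auto & p : graph) dispatch_op(dev, p, /*dryrun=*/false); // pass 2
    return 0;
}
```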
@@ -4641,9 +5062,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 }
 }

-
+ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
 if (split_k > 1) {
-
+ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);

 if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
 // Resize buffer
@@ -4654,6 +5075,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 }
 }

+ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
 vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -4686,7 +5109,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
 ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);

-vk_context
+vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
 for (size_t i = 0; i < num_it; i++) {
 ggml_vk_ctx_begin(ctx->device, subctx);
 ggml_vk_matmul(
@@ -4770,7 +5193,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t

 avg_err /= m * n;

-
+double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;

 if (avg_err > 0.1) {
 std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
@@ -4890,14 +5315,16 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 ggml_vk_quantize_data(x, qx, ne, quant);
 ggml_vk_dequantize_data(qx, x_ref, ne, quant);

-
+ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+
+ggml_pipeline_allocate_descriptor_sets(ctx->device);

 ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);

-vk_context
+vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
 ggml_vk_ctx_begin(ctx->device, subctx);
 const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
-ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
+ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
 ggml_vk_ctx_end(subctx);

 auto begin = std::chrono::high_resolution_clock::now();
@@ -5011,9 +5438,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 y[i] = (i % k == i / k) ? 1.0f : 0.0f;
 }

-
+ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
 if (split_k > 1) {
-
+ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);

 if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
 // Resize buffer
@@ -5024,10 +5451,12 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 }
 }

+ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
 ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
 ggml_vk_buffer_write(y_buf, 0, y, y_sz);

-vk_context
+vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
 for (size_t i = 0; i < num_it; i++) {
 ggml_vk_ctx_begin(ctx->device, subctx);
 ggml_vk_matmul(
@@ -5091,7 +5520,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

 avg_err /= m * n;

-
+double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;

 if (avg_err > 0.01 || std::isnan(avg_err)) {
 std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
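Note: the benchmark hunks above add a TFLOPS figure to the test output. The accounting behind the added line is simply "about 2*m*n*k floating-point operations per m-by-k times k-by-n matrix multiply" scaled by batch and iteration count, divided by elapsed time. A minimal standalone check of that arithmetic, with example values that are assumptions and not measurements from the diff:

```cpp
#include <cstdio>

int main() {
    const double m = 4096, n = 512, k = 4096;   // example problem size (assumed)
    const double batch = 2, num_it = 100;       // example batch/iteration count (assumed)
    const double time_ms = 350.0;               // example total wall time in ms (assumed)

    // One multiply and one add per accumulated term -> ~2*m*n*k FLOPs per matmul.
    const double flops  = 2.0 * m * n * k * batch * num_it;
    const double tflops = flops / (time_ms / 1000.0) / 1e12;

    std::printf("%.2f ms/iter, %.2f TFLOPS\n", time_ms / num_it, tflops);
    return 0;
}
```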
@@ -5133,132 +5564,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 }
 #endif

-static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
-ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-extra->reset();
-tensor->extra = extra;
-return extra;
-}
-
-static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
-ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
-if (extra == nullptr) {
-return;
-}
-
-ggml_tensor * src0 = node->src[0];
-ggml_tensor * src1 = node->src[1];
-
-const bool use_src0 = src0 != nullptr;
-const int64_t ne00 = use_src0 ? src0->ne[0] : 0;
-const int64_t ne01 = use_src0 ? src0->ne[1] : 0;
-const int64_t ne02 = use_src0 ? src0->ne[2] : 0;
-const int64_t ne03 = use_src0 ? src0->ne[3] : 0;
-const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP;
-const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
-const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
-const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
-const int64_t ne13 = use_src1 ? src1->ne[3] : 0;
-const int64_t ne20 = node->ne[0];
-const int64_t ne21 = node->ne[1];
-const int64_t ne22 = node->ne[2];
-const int64_t ne23 = node->ne[3];
-
-const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
-const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
-
-const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
-const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
-
-const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
-
-bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
-
-const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
-const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
-
-int split_k;
-if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
-split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
-} else {
-split_k = 1;
-}
-const uint32_t x_ne = ne00 * ne01;
-const uint32_t y_ne = ne10 * ne11;
-const uint32_t d_ne = ne20 * ne21;
-
-const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
-const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
-
-if (extra->buffer_gpu.expired()) {
-// Workaround for CPU backend BLAS matmul calls
-extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz);
-}
-
-switch (node->op) {
-case GGML_OP_REPEAT:
-case GGML_OP_GET_ROWS:
-case GGML_OP_RESHAPE:
-case GGML_OP_VIEW:
-case GGML_OP_PERMUTE:
-case GGML_OP_TRANSPOSE:
-case GGML_OP_ADD:
-case GGML_OP_SCALE:
-case GGML_OP_SQR:
-case GGML_OP_CLAMP:
-case GGML_OP_CPY:
-case GGML_OP_CONT:
-case GGML_OP_DUP:
-case GGML_OP_MUL:
-case GGML_OP_DIV:
-case GGML_OP_NORM:
-case GGML_OP_RMS_NORM:
-case GGML_OP_DIAG_MASK_INF:
-case GGML_OP_SOFT_MAX:
-case GGML_OP_ROPE:
-case GGML_OP_ARGSORT:
-case GGML_OP_SUM_ROWS:
-break;
-case GGML_OP_UNARY:
-switch (ggml_get_unary_op(node)) {
-case GGML_UNARY_OP_SILU:
-case GGML_UNARY_OP_GELU:
-case GGML_UNARY_OP_RELU:
-break;
-default:
-return;
-}
-break;
-case GGML_OP_MUL_MAT:
-case GGML_OP_MUL_MAT_ID:
-if (ctx->prealloc_size_x < x_sz) {
-ctx->prealloc_size_x = x_sz;
-}
-if (ctx->prealloc_size_y < y_sz) {
-ctx->prealloc_size_y = y_sz;
-}
-if (ctx->prealloc_size_split_k < split_k_size) {
-ctx->prealloc_size_split_k = split_k_size;
-}
-if (ctx->staging_size < x_sz + y_sz) {
-ctx->staging_size = x_sz + y_sz;
-}
-break;
-default:
-return;
-}
-}
-
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
-ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
-vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
 ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@@ -5418,28 +5725,19 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k);
 }
-if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
-VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
-// Resize buffer
-if (ctx->staging != nullptr) {
-ggml_vk_destroy_buffer(ctx->staging);
-}
-ctx->staging = ggml_vk_create_buffer_check(ctx->device, ctx->staging_size,
-vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-}
 }

-static
-ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);

-
-
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+if (ggml_is_empty(node) || !node->buffer) {
+return false;
 }

 VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
 ctx->semaphore_idx = 0;
-ctx->staging_offset = 0;

 const ggml_tensor * src0 = node->src[0];
 const ggml_tensor * src1 = node->src[1];
@@ -5452,29 +5750,38 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 case GGML_OP_PERMUTE:
 case GGML_OP_TRANSPOSE:
 case GGML_OP_NONE:
-return;
+return false;
 case GGML_OP_UNARY:
 switch (ggml_get_unary_op(node)) {
 case GGML_UNARY_OP_SILU:
 case GGML_UNARY_OP_GELU:
+case GGML_UNARY_OP_GELU_QUICK:
 case GGML_UNARY_OP_RELU:
+case GGML_UNARY_OP_TANH:
 break;
 default:
-return;
+return false;
 }
 break;
 case GGML_OP_REPEAT:
 case GGML_OP_GET_ROWS:
 case GGML_OP_ADD:
+case GGML_OP_ACC:
 case GGML_OP_MUL:
 case GGML_OP_DIV:
+case GGML_OP_CONCAT:
+case GGML_OP_UPSCALE:
 case GGML_OP_SCALE:
 case GGML_OP_SQR:
+case GGML_OP_SIN:
+case GGML_OP_COS:
 case GGML_OP_CLAMP:
+case GGML_OP_PAD:
 case GGML_OP_CPY:
 case GGML_OP_CONT:
 case GGML_OP_DUP:
 case GGML_OP_NORM:
+case GGML_OP_GROUP_NORM:
 case GGML_OP_RMS_NORM:
 case GGML_OP_DIAG_MASK_INF:
 case GGML_OP_SOFT_MAX:
@@ -5483,138 +5790,221 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 case GGML_OP_MUL_MAT_ID:
 case GGML_OP_ARGSORT:
 case GGML_OP_SUM_ROWS:
+case GGML_OP_IM2COL:
+case GGML_OP_TIMESTEP_EMBEDDING:
+case GGML_OP_LEAKY_RELU:
 break;
 default:
 std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
 GGML_ABORT("fatal error");
-return;
+return false;
 }

-
-
-
+vk_context compute_ctx;
+
+if (!dryrun) {
+if (ctx->compute_ctx.expired()) {
+compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ctx->compute_ctx = compute_ctx;
+ggml_vk_ctx_begin(ctx->device, compute_ctx);
+} else {
+compute_ctx = ctx->compute_ctx.lock();
+}
 }

 switch (node->op) {
 case GGML_OP_REPEAT:
-ggml_vk_repeat(ctx,
+ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_ACC:
+ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_GET_ROWS:
-ggml_vk_get_rows(ctx,
+ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_ADD:
-ggml_vk_add(ctx,
+ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_MUL:
-ggml_vk_mul(ctx,
+ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_DIV:
-ggml_vk_div(ctx,
+ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun);
+
+break;
+case GGML_OP_CONCAT:
+ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun);
+
+break;
+case GGML_OP_UPSCALE:
+ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_SCALE:
-ggml_vk_scale(ctx,
+ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_SQR:
-ggml_vk_sqr(ctx,
+ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_SIN:
+ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_COS:
+ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_CLAMP:
-ggml_vk_clamp(ctx,
+ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_PAD:
+ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_CPY:
 case GGML_OP_CONT:
 case GGML_OP_DUP:
-ggml_vk_cpy(ctx,
+ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_NORM:
-ggml_vk_norm(ctx,
+ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_GROUP_NORM:
+ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_RMS_NORM:
-ggml_vk_rms_norm(ctx,
+ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_UNARY:
 switch (ggml_get_unary_op(node)) {
 case GGML_UNARY_OP_SILU:
 case GGML_UNARY_OP_GELU:
+case GGML_UNARY_OP_GELU_QUICK:
 case GGML_UNARY_OP_RELU:
-
+case GGML_UNARY_OP_TANH:
+ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
 break;
 default:
-return;
+return false;
 }
 break;
 case GGML_OP_DIAG_MASK_INF:
-ggml_vk_diag_mask_inf(ctx,
+ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_SOFT_MAX:
-ggml_vk_soft_max(ctx,
+ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_ROPE:
-ggml_vk_rope(ctx,
+ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, dryrun);

 break;
 case GGML_OP_ARGSORT:
-ggml_vk_argsort(ctx,
+ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_SUM_ROWS:
-ggml_vk_sum_rows(ctx,
+ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_IM2COL:
+ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
+
+break;
+case GGML_OP_TIMESTEP_EMBEDDING:
+ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
+
+break;
+case GGML_OP_LEAKY_RELU:
+ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);

 break;
 case GGML_OP_MUL_MAT:
-ggml_vk_mul_mat(ctx,
+ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node, dryrun);

 break;
 case GGML_OP_MUL_MAT_ID:
-ggml_vk_mul_mat_id(ctx,
+ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun);

 break;
 default:
-return;
+return false;
 }

-
+if (dryrun) {
+return false;
+}

-
+ctx->tensor_ctxs[node_idx] = compute_ctx;
+
+#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
 // Force context reset on each node so that each tensor ends up in its own context
 // and can be run and compared to its CPU equivalent separately
 last_node = true;
 #endif

-if (last_node) {
-ggml_vk_ctx_end(
-
-
+if (submit || last_node) {
+ggml_vk_ctx_end(compute_ctx);
+
+// TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+if (last_node) {
+compute_ctx->exit_tensor_idx = node_idx_begin;
+}
+else {
+compute_ctx->exit_tensor_idx = -1;
+}
+
+ctx->compute_ctx.reset();
+
+bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+if (!ok) {
+if (node->op == GGML_OP_UNARY) {
+std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+}
+else {
+std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+}
+}
+
 }
+return true;
 }

-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
-
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
+ggml_backend_buffer * buf = nullptr;

 switch (tensor->op) {
 case GGML_OP_ADD:
+case GGML_OP_ACC:
 case GGML_OP_GET_ROWS:
 case GGML_OP_MUL:
 case GGML_OP_DIV:
+case GGML_OP_CONCAT:
+case GGML_OP_UPSCALE:
 case GGML_OP_SCALE:
 case GGML_OP_SQR:
+case GGML_OP_SIN:
+case GGML_OP_COS:
 case GGML_OP_CLAMP:
+case GGML_OP_PAD:
 case GGML_OP_CPY:
 case GGML_OP_CONT:
 case GGML_OP_DUP:
 case GGML_OP_NORM:
+case GGML_OP_GROUP_NORM:
 case GGML_OP_RMS_NORM:
 case GGML_OP_DIAG_MASK_INF:
 case GGML_OP_SOFT_MAX:
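Note: the build-graph hunks above change the compute context from a raw pointer into a weak reference that is created on demand, locked for reuse, remembered per node in ctx->tensor_ctxs, and reset when a batch is submitted. The toy program below shows that ownership pattern with std::shared_ptr/std::weak_ptr only; the types and member names are illustrative stand-ins, not the ggml backend structures.

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// Toy model of the weak-reference lifecycle visible in the diff.
struct ToyContext { int exit_tensor_idx = -1; };
using ctx_ptr = std::shared_ptr<ToyContext>;
using ctx_ref = std::weak_ptr<ToyContext>;

struct Backend {
    ctx_ref current;                  // like ctx->compute_ctx
    std::vector<ctx_ref> tensor_ctxs; // like ctx->tensor_ctxs, one slot per node
    std::vector<ctx_ptr> owned;       // something must own the contexts; the real
                                      // backend keeps them alive elsewhere
};

ctx_ptr get_or_create(Backend & b) {
    if (b.current.expired()) {        // no live context: start recording a new one
        auto c = std::make_shared<ToyContext>();
        b.owned.push_back(c);
        b.current = c;
        return c;
    }
    return b.current.lock();          // reuse the context already being recorded
}

int main() {
    Backend b;
    b.tensor_ctxs.resize(3);
    for (int node = 0; node < 3; node++) {
        ctx_ptr c = get_or_create(b);
        b.tensor_ctxs[node] = c;      // remember which context holds this node
        if (node == 1) {              // pretend the batch is submitted after node 1
            c->exit_tensor_idx = 0;
            b.current.reset();        // the next node starts a fresh context
        }
    }
    std::printf("contexts created: %zu\n", b.owned.size()); // prints 2
    return 0;
}
```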
@@ -5626,15 +6016,21 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 case GGML_OP_NONE:
 case GGML_OP_ARGSORT:
 case GGML_OP_SUM_ROWS:
-
+case GGML_OP_IM2COL:
+case GGML_OP_TIMESTEP_EMBEDDING:
+case GGML_OP_LEAKY_RELU:
+case GGML_OP_REPEAT:
+buf = tensor->buffer;

 break;
 case GGML_OP_UNARY:
 switch (ggml_get_unary_op(tensor)) {
 case GGML_UNARY_OP_SILU:
 case GGML_UNARY_OP_GELU:
+case GGML_UNARY_OP_GELU_QUICK:
 case GGML_UNARY_OP_RELU:
-
+case GGML_UNARY_OP_TANH:
+buf = tensor->buffer;
 break;
 default:
 return false;
@@ -5642,45 +6038,57 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 break;
 case GGML_OP_MUL_MAT:
 case GGML_OP_MUL_MAT_ID:
-
+buf = tensor->buffer;

 break;
 default:
 return false;
 }

-if (
+if (buf == nullptr) {
 return false;
 }

 VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

-
-ggml_vk_check_results_0(ctx, tensor);
-#endif
+vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();

-
+// always wait for the GPU work to be done for the last submit
+if (tensor_idx == subctx->exit_tensor_idx) {
+use_fence = true;
+}

 // Only run if ctx hasn't been submitted yet
-if (!subctx
+if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+ggml_vk_check_results_0(tensor);
+use_fence = true;
+#endif
+
 // Do staging buffer copies
-for (auto& cpy : subctx
+for (auto& cpy : subctx->in_memcpys) {
 memcpy(cpy.dst, cpy.src, cpy.n);
 }

-ggml_vk_submit(
-
+ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+if (use_fence) {
+VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");

-
-
-
+ctx->device->device.resetFences({ ctx->fence });
+}
+#ifdef GGML_VULKAN_CHECK_RESULTS
+ggml_vk_check_results_1(tensor);
+#endif
+}

+if (tensor_idx == subctx->exit_tensor_idx) {
 // Do staging buffer copies
-for (auto& cpy : subctx
+for (auto& cpy : subctx->out_memcpys) {
 memcpy(cpy.dst, cpy.src, cpy.n);
 }
-subctx
-subctx
+subctx->in_memcpys.clear();
+subctx->out_memcpys.clear();
 }

 return true;
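Note: in the hunk above, ggml_vk_compute_forward now submits with an optional fence and only waits on (and resets) the fence when one was actually used, which happens at least for the node marked exit_tensor_idx. The sketch below mirrors that control flow with std::async standing in for queue submission; it is a behavioural analogy under that assumption, not Vulkan code.

```cpp
#include <cstdio>
#include <future>
#include <vector>

// Toy model: every node submits work, but the host only blocks for the node
// marked as the exit of a batch (exit_tensor_idx).
int submit_work(int node_idx) { return node_idx * 2; }  // stands in for GPU work

int main() {
    const int exit_tensor_idx = 3;
    std::vector<std::future<int>> in_flight;

    for (int node_idx = 0; node_idx <= exit_tensor_idx; node_idx++) {
        bool use_fence = (node_idx == exit_tensor_idx);   // mirror the diff's rule

        auto fut = std::async(std::launch::async, submit_work, node_idx);

        if (use_fence) {
            // Equivalent of waitForFences + resetFences: block until the batch is done.
            std::printf("fenced wait on node %d -> %d\n", node_idx, fut.get());
        } else {
            // No fence: keep the handle and keep going; destruction at the end of
            // main() still joins these, which is fine for a toy program.
            in_flight.push_back(std::move(fut));
        }
    }
    return 0;
}
```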
@@ -5694,12 +6102,14 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 }
 ctx->gc.temp_buffers.clear();

-for (auto&
-
+for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
+vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
+
+if (plr.expired()) {
 continue;
 }

-vk_pipeline pl =
+vk_pipeline pl = plr.lock();
 ggml_pipeline_cleanup(pl);
 }

@@ -5723,11 +6133,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 ctx->device->device.resetEvent(event);
 }

-ctx->
-
-ctx->compute_ctx = nullptr;
-ctx->transfer_ctx = nullptr;
+ctx->tensor_ctxs.clear();
 ctx->gc.contexts.clear();
+ctx->device->pipeline_descriptor_set_requirements.clear();
 }

 // Clean up on backend free
@@ -5738,7 +6146,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 ggml_vk_destroy_buffer(ctx->prealloc_x);
 ggml_vk_destroy_buffer(ctx->prealloc_y);
 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
-ggml_vk_destroy_buffer(ctx->staging);

 for (auto& buffer : ctx->buffer_pool) {
 ggml_vk_destroy_buffer(buffer);
@@ -5747,7 +6154,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 ctx->prealloc_size_x = 0;
 ctx->prealloc_size_y = 0;
 ctx->prealloc_size_split_k = 0;
-ctx->staging_size = 0;

 for (auto& event : ctx->gc.events) {
 ctx->device->device.destroyEvent(event);
@@ -5757,13 +6163,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 ctx->device->device.destroyFence(ctx->fence);
 }

-
+static int ggml_vk_get_device_count() {
 ggml_vk_instance_init();

 return vk_instance.device_indices.size();
 }

-
+static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
 ggml_vk_instance_init();

 std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -5780,111 +6186,61 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript

 // device backend

-static
-
-struct ggml_backend_vk_buffer_context {
-vk_device_ref device;
-vk_buffer dev_buffer;
-ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-size_t temp_tensor_extra_index = 0;
-std::string name;
-
-ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
-device(device),
-dev_buffer(dev_buffer),
-name(name) {
-}
-
-~ggml_backend_vk_buffer_context() {
-ggml_vk_destroy_buffer(dev_buffer);
-if (temp_tensor_extras != nullptr) {
-delete[] temp_tensor_extras;
-}
-}
-
-ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
-if (temp_tensor_extras == nullptr) {
-temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
-}
-
-size_t alloc_index = temp_tensor_extra_index;
-temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
-ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-extra->reset();
-
-return extra;
-}
-};
-
-GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 return ctx->name.c_str();
 }

-
+static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
 return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
 }

-
+static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 ggml_vk_destroy_buffer(ctx->dev_buffer);
 delete ctx;
 }

-
+static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
 return vk_ptr_base;

 UNUSED(buffer);
 }

-
+static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
 VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
-ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
 if (tensor->view_src != nullptr) {
 GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-GGML_ASSERT(tensor->view_src->extra != nullptr);
-tensor->extra = tensor->view_src->extra;
-} else {
-ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-extra->buffer_gpu = ctx->dev_buffer;
-extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
-tensor->extra = extra;
 }
 }

-
+static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-
-
-vk_buffer buf = extra->buffer_gpu.lock();
-
-ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size);
+ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+vk_buffer buf = buf_ctx->dev_buffer;

-
+ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }

-
+static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-
+ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;

-vk_buffer buf =
+vk_buffer buf = buf_ctx->dev_buffer;

-ggml_vk_buffer_read(buf,
-
-GGML_UNUSED(buffer);
+ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }

-
+static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
 if (ggml_backend_buffer_is_vk(src->buffer)) {
-
-
+ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;

-vk_buffer src_buf =
-vk_buffer dst_buf =
+vk_buffer src_buf = src_buf_ctx->dev_buffer;
+vk_buffer dst_buf = dst_buf_ctx->dev_buffer;

-ggml_vk_buffer_copy(dst_buf,
+ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));

 return true;
 }
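Note: the hunk above drops the per-tensor "extra" bookkeeping (ggml_tensor_extra_gpu) and instead computes buffer offsets directly from the tensor via vk_tensor_offset(tensor) + tensor->view_offs. The diff does not show vk_tensor_offset()'s body; the sketch below assumes it derives the offset from the tensor's data pointer relative to the buffer base, the same way the removed `extra->offset = (uint8_t *)tensor->data - (uint8_t *)vk_ptr_base` line did. All names here are toy stand-ins.

```cpp
#include <cstdint>
#include <cstdio>

// Toy tensor: only the fields relevant to offset computation.
struct toy_tensor {
    void * data;       // address handed out by the buffer's get_base()
    size_t view_offs;  // extra offset when this tensor is a view into another one
};

static uint8_t toy_pool[1024];  // stands in for the buffer's base address (vk_ptr_base)

// Assumed-equivalent of vk_tensor_offset(): data pointer minus the buffer base.
static size_t toy_tensor_offset(const toy_tensor & t) {
    return static_cast<size_t>(reinterpret_cast<const uint8_t *>(t.data) - toy_pool);
}

int main() {
    toy_tensor t;
    t.data      = toy_pool + 256;  // tensor lives 256 bytes into the buffer
    t.view_offs = 64;              // and this view starts 64 bytes further in

    // Mirrors the vk_tensor_offset(tensor) + tensor->view_offs pattern in the new code.
    std::printf("buffer offset = %zu\n", toy_tensor_offset(t) + t.view_offs); // prints 320
    return 0;
}
```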
@@ -5893,7 +6249,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
 UNUSED(buffer);
 }

-
+static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

 ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
@@ -5904,6 +6260,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
 /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
 /* .get_base = */ ggml_backend_vk_buffer_get_base,
 /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
+/* .memset_tensor = */ NULL,
 /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
 /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
 /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
@@ -5912,13 +6269,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
 };

 // vk buffer type
-
+static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;

 return ctx->name.c_str();
 }

-
+static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;

@@ -5934,23 +6291,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
 return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
 }

-
+static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
 return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
 }

-
+static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
 return ctx->device->max_memory_allocation_size;
 }

-
+static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
 return ggml_nbytes(tensor);

 UNUSED(buft);
 }

-
+ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 ggml_vk_instance_init();

 VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
@@ -5962,24 +6319,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num)

 // host buffer type

-
+static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
 return GGML_VK_NAME "_Host";

 UNUSED(buft);
 }

-
+static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
 return GGML_VK_NAME "_Host";

 UNUSED(buffer);
 }

-
+static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
 ggml_vk_host_free(vk_instance.devices[0], buffer->context);
 }

-
+static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");

 size += 32; // Behave like the CPU buffer type
@@ -6003,7 +6360,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
 UNUSED(buft);
 }

-
+static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;

 UNUSED(buft);
@@ -6011,7 +6368,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back

 // Should be changed to return device-specific host buffer type
 // but that probably requires changes in llama.cpp
-
+ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
 static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
 /* .iface = */ {
 /* .get_name = */ ggml_backend_vk_host_buffer_type_name,
@@ -6021,6 +6378,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
 /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
 },
+/* .device = */ nullptr,
 /* .context = */ nullptr,
 };

@@ -6034,13 +6392,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {

 // backend

-
+static const char * ggml_backend_vk_name(ggml_backend_t backend) {
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

 return ctx->name.c_str();
 }

-
+static void ggml_backend_vk_free(ggml_backend_t backend) {
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");

@@ -6050,107 +6408,125 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
|
|
|
6050
6408
|
delete backend;
|
|
6051
6409
|
}
|
|
6052
6410
|
|
|
6053
|
-
|
|
6411
|
+
static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
|
|
6054
6412
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
6055
6413
|
|
|
6056
6414
|
return &ctx->device->buffer_type;
|
|
6057
6415
|
}
|
|
6058
6416
|
|
|
6059
|
-
|
|
6417
|
+
static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
6060
6418
|
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
|
|
6061
6419
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
6062
6420
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
|
6063
6421
|
|
|
6064
|
-
|
|
6422
|
+
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
|
6423
|
+
|
|
6424
|
+
vk_context transfer_ctx;
|
|
6065
6425
|
|
|
6066
|
-
if (ctx->transfer_ctx
|
|
6426
|
+
if (ctx->transfer_ctx.expired()) {
|
|
6067
6427
|
// Initialize new transfer context
|
|
6068
|
-
|
|
6069
|
-
|
|
6428
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
|
6429
|
+
ctx->transfer_ctx = transfer_ctx;
|
|
6430
|
+
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
6431
|
+
} else {
|
|
6432
|
+
transfer_ctx = ctx->transfer_ctx.lock();
|
|
6070
6433
|
}
|
|
6071
6434
|
|
|
6072
|
-
vk_buffer buf =
|
|
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_write_async(
+    ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
-
+static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
-
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+
+    vk_context transfer_ctx;
 
-    if (ctx->transfer_ctx
+    if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-
-
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+        ctx->transfer_ctx = transfer_ctx;
+        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+    } else {
+        transfer_ctx = ctx->transfer_ctx.lock();
     }
 
-    vk_buffer buf =
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_read_async(
+    ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
-
+static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
-
-
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+
+        vk_context transfer_ctx;
 
-        if (ctx->transfer_ctx
+        if (ctx->transfer_ctx.expired()) {
             // Initialize new transfer context
-
-
+            transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+            ctx->transfer_ctx = transfer_ctx;
+            ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+        } else {
+            transfer_ctx = ctx->transfer_ctx.lock();
         }
 
-        vk_buffer src_buf =
-        vk_buffer dst_buf =
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
 
-        ggml_vk_buffer_copy_async(
+        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
         return true;
     }
 
     return false;
 }
 
-
+static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    if(ctx->transfer_ctx
+    if(ctx->transfer_ctx.expired()) {
         return;
     }
 
-
+    vk_context transfer_ctx = ctx->transfer_ctx.lock();
+
+    ggml_vk_ctx_end(transfer_ctx);
 
-    for (auto& cpy :
+    for (auto& cpy : transfer_ctx->in_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
 
-    ggml_vk_submit(
+    ggml_vk_submit(transfer_ctx, ctx->fence);
     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
     ctx->device->device.resetFences({ ctx->fence });
 
-    for (auto& cpy :
+    for (auto& cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
 
-    ctx->transfer_ctx
+    ctx->transfer_ctx.reset();
 }
 
 static bool ggml_vk_is_empty(ggml_tensor * node) {
     return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
 }
 
-
+static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
 
     int last_node = cgraph->n_nodes - 1;
 
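The async set/get/copy paths above no longer reuse a raw transfer context member; `ctx->transfer_ctx` is now a weak handle that is re-created on demand (`expired()`), shared between queued transfers (`lock()`), and dropped after `ggml_backend_vk_synchronize` (`reset()`). Below is a minimal standalone sketch of that get-or-create lifecycle, assuming `vk_context` behaves like a `std::shared_ptr` whose owning reference is kept elsewhere by the backend; all names in the sketch are illustrative, not ggml API.

```cpp
// Standalone sketch of the weak-handle transfer-context pattern (illustrative names).
#include <iostream>
#include <memory>
#include <vector>

struct transfer_context {
    std::vector<int> pending; // stands in for queued copies / command buffers
};

struct backend_state {
    std::weak_ptr<transfer_context> transfer_ctx; // non-owning, may expire
};

// Return the live transfer context, creating one only if none is in flight.
static std::shared_ptr<transfer_context> get_or_create(backend_state & st) {
    if (st.transfer_ctx.expired()) {
        auto ctx = std::make_shared<transfer_context>();
        st.transfer_ctx = ctx; // the backend only remembers it weakly
        return ctx;            // the caller holds the owning reference
    }
    return st.transfer_ctx.lock();
}

static void synchronize(backend_state & st) {
    if (st.transfer_ctx.expired()) {
        return; // nothing queued since the last synchronize
    }
    auto ctx = st.transfer_ctx.lock();
    std::cout << "flushing " << ctx->pending.size() << " queued transfers\n";
    st.transfer_ctx.reset(); // the next async call starts a fresh context
}

int main() {
    backend_state st;

    // First async transfer: no context in flight yet, so one is created.
    auto ctx = get_or_create(st);
    ctx->pending.push_back(1);

    // A second async transfer before synchronize reuses the same context.
    get_or_create(st)->pending.push_back(2);

    synchronize(st); // prints "flushing 2 queued transfers"
    synchronize(st); // the weak handle was reset, so this returns early
    return 0;
}
```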
@@ -6159,29 +6535,45 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
         last_node -= 1;
     }
 
-
-
-    }
+    // Reserve tensor context space for all nodes
+    ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-
-
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
-
-
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool
-
-
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
+            }
+#endif
         }
-
-
-
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
         }
-#endif
-        GGML_ASSERT(ok);
     }
 
+#ifdef GGML_VULKAN_PERF
+    ctx->device->perf_logger->print_timings();
+#endif
+
     ggml_vk_graph_cleanup(ctx);
 
     return GGML_STATUS_SUCCESS;
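The hunk above replaces the simple per-node loop with batched submission: nodes are recorded into a batch and flushed every `submit_count` enqueued nodes, or at the last node, so command-buffer recording on the CPU overlaps with GPU execution. A standalone sketch of just that batching loop follows; the `node`, `build_node`, and `submit_batch` types are stand-ins, not ggml API.

```cpp
// Sketch of the batched-submission loop (stand-in types, not ggml API).
#include <cstdio>
#include <vector>

struct node { int id; bool produces_work; };

static bool build_node(const node & n) {
    // Pretend to record a command for this node; "empty" nodes enqueue nothing.
    return n.produces_work;
}

static void submit_batch(int first, int last) {
    std::printf("submit nodes [%d, %d]\n", first, last);
}

static void compute(const std::vector<node> & nodes) {
    const int last_node = (int) nodes.size() - 1;

    constexpr int submit_count = 100; // flush after this many enqueued nodes
    bool first_node_in_batch = true;
    int  submit_node_idx     = 0;     // first node of the current batch
    int  submitted_nodes     = 0;

    for (int i = 0; i <= last_node; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i;
        }

        const bool submit = (submitted_nodes >= submit_count) || (i == last_node);

        if (build_node(nodes[i])) {
            ++submitted_nodes;
            first_node_in_batch = false;
        }

        if (submit) {
            submit_batch(submit_node_idx, i);
            first_node_in_batch = true;
            submitted_nodes     = 0;
        }
    }
}

int main() {
    std::vector<node> graph;
    for (int i = 0; i < 250; i++) {
        graph.push_back({ i, i % 5 != 0 }); // every 5th node is "empty"
    }
    compute(graph); // flushed in two batches: nodes [1, 125] and [126, 249]
    return 0;
}
```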
@@ -6189,15 +6581,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     UNUSED(backend);
 }
 
-
+static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
 
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_TANH:
                     return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
@@ -6254,6 +6648,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     return false;
                 }
             } break;
+        case GGML_OP_CONT:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
             {
@@ -6270,11 +6665,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return false;
             } break;
-
-
-            // ggml_type src0_type = op->src[0]->type;
-            // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            // } break;
+        case GGML_OP_REPEAT:
+            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
@@ -6283,18 +6675,27 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ACC:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
-        case
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
         case GGML_OP_CLAMP:
-        case
+        case GGML_OP_PAD:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT:
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_LEAKY_RELU:
            return true;
        default:
            return false;
@@ -6303,7 +6704,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
     UNUSED(backend);
 }
 
-
+static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
     return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
@@ -6312,7 +6713,7 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
-
+static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
         return false;
     }
@@ -6340,11 +6741,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .supports_op = */ ggml_backend_vk_supports_op,
     /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .event_synchronize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_vk_guid() {
@@ -6352,7 +6750,7 @@ static ggml_guid_t ggml_backend_vk_guid() {
     return &guid;
 }
 
-
+ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
     VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
 
     ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
@@ -6361,25 +6759,26 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
     ggml_backend_t vk_backend = new ggml_backend {
         /* .guid = */ ggml_backend_vk_guid(),
         /* .interface = */ ggml_backend_vk_interface,
+        /* .device = */ nullptr,
         /* .context = */ ctx,
     };
 
     return vk_backend;
 }
 
-
+bool ggml_backend_is_vk(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
 }
 
-
+int ggml_backend_vk_get_device_count() {
     return ggml_vk_get_device_count();
 }
 
-
+void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_get_device_description(device, description, description_size);
 }
 
-
+void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
     GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
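The hunks above drop the `GGML_CALL` decoration from the public Vulkan backend entry points (`ggml_backend_vk_init`, `ggml_backend_is_vk`, `ggml_backend_vk_get_device_count`, `ggml_backend_vk_get_device_description`, `ggml_backend_vk_get_device_memory`). A hedged usage sketch of those functions follows, assuming the `ggml-vulkan.h` / `ggml-backend.h` headers from the bundled llama.cpp sources and a build with the Vulkan backend enabled; it is an illustration, not part of this package's own API surface.

```cpp
// Sketch: enumerate Vulkan devices and create a backend on the first one.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main() {
    const int n_dev = ggml_backend_vk_get_device_count();
    std::printf("Vulkan devices: %d\n", n_dev);

    for (int i = 0; i < n_dev; i++) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
        ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);
        std::printf("  #%d: %s (%zu / %zu bytes free)\n", i, desc, free_mem, total_mem);
    }

    if (n_dev > 0) {
        ggml_backend_t backend = ggml_backend_vk_init(0); // first device
        if (backend != nullptr && ggml_backend_is_vk(backend)) {
            std::printf("created backend: %s\n", ggml_backend_name(backend));
            ggml_backend_free(backend);
        }
    }
    return 0;
}
```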
@@ -6395,27 +6794,6 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size
     }
 }
 
-// backend registry
-GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
-    ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
-    return vk_backend;
-
-    UNUSED(params);
-}
-
-extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
-
-GGML_CALL int ggml_backend_vk_reg_devices() {
-    ggml_vk_instance_init();
-
-    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
-    }
-    return vk_instance.device_indices.size();
-}
-
 // Extension availability
 static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
 #ifdef GGML_VULKAN_VALIDATE
@@ -6509,17 +6887,19 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
     }
 }
 
-static void ggml_vk_print_tensor(
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;
 
-
+    const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer);
+
+    if (is_gpu) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu =
-        ggml_vk_buffer_read(buffer_gpu,
+        vk_buffer buffer_gpu = buf_ctx->dev_buffer;
+        ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6533,13 +6913,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::cerr << std::endl << "Result:" << std::endl;
     ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
     std::cerr << std::endl;
-    std::cerr << std::endl << "Result:" << std::endl;
-    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
-    std::cerr << std::endl;
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);
 
-    if (
+    if (is_gpu) {
         free(tensor_data);
     }
 }
@@ -6548,8 +6925,8 @@ void * comp_result;
 size_t comp_size;
 size_t comp_nb[GGML_MAX_DIMS];
 size_t check_counter = 0;
-static void ggml_vk_check_results_0(
-
+static void ggml_vk_check_results_0(ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
 
@@ -6565,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
     ggml_tensor * src2 = tensor->src[2];
 
     struct ggml_init_params iparams = {
-        /*.mem_size =*/
+        /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
         /*.mem_buffer =*/ NULL,
         /*.no_alloc =*/ false,
     };
@@ -6596,9 +6973,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
-
-            vk_buffer buffer_gpu =
-            uint64_t offset =
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src0) + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6624,7 +7001,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-            ggml_vk_print_tensor(
+            ggml_vk_print_tensor(src0, "src0");
         }
     }
     if (src1 != nullptr) {
@@ -6638,9 +7015,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
-
-            vk_buffer buffer_gpu =
-            uint64_t offset =
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src1) + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6666,23 +7043,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-            ggml_vk_print_tensor(
-            std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
-            std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
-            if (src1->src[0] != nullptr) {
-                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
-            }
-            if (src1->src[1] != nullptr) {
-                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
-            }
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
-            std::cerr << std::endl;
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
-            std::cerr << std::endl;
-            std::vector<const ggml_tensor *> done;
-            ggml_vk_print_graph_origin(src1_clone, done);
+            ggml_vk_print_tensor(src1, "src1");
         }
     }
     if (src2 != nullptr) {
@@ -6696,9 +7057,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
             memcpy(src2_clone->data, src2->data, src2_size);
             memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
-
-            vk_buffer buffer_gpu =
-            uint64_t offset =
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src2) + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -6724,23 +7085,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-            ggml_vk_print_tensor(
-            std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
-            std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
-            if (src2->src[0] != nullptr) {
-                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
-            }
-            if (src2->src[1] != nullptr) {
-                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
-            }
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
-            std::cerr << std::endl;
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
-            std::cerr << std::endl;
-            std::vector<const ggml_tensor *> done;
-            ggml_vk_print_graph_origin(src2_clone, done);
+            ggml_vk_print_tensor(src2, "src2");
         }
     }
 
@@ -6752,16 +7097,32 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_DIV) {
         tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_CONCAT) {
+        tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_UPSCALE) {
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
     } else if (tensor->op == GGML_OP_SCALE) {
         tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
     } else if (tensor->op == GGML_OP_SQR) {
         tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_SIN) {
+        tensor_clone = ggml_sin(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_COS) {
+        tensor_clone = ggml_cos(ggml_ctx, src0_clone);
     } else if (tensor->op == GGML_OP_CLAMP) {
         tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+    } else if (tensor->op == GGML_OP_PAD) {
+        tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
+    } else if (tensor->op == GGML_OP_REPEAT) {
+        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor);
     } else if (tensor->op == GGML_OP_ADD) {
         tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_ACC) {
+        tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
     } else if (tensor->op == GGML_OP_NORM) {
         tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_GROUP_NORM) {
+        tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params, ((float *)tensor->op_params)[1]);
     } else if (tensor->op == GGML_OP_RMS_NORM) {
         tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
     } else if (tensor->op == GGML_OP_SOFT_MAX) {
@@ -6777,12 +7138,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         const int mode = ((int32_t *) tensor->op_params)[2];
         //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
         const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
-        float freq_base = ((float *)
-        float freq_scale = ((float *)
-        float ext_factor = ((float *)
-        float attn_factor = ((float *)
-        float beta_fast = ((float *)
-        float beta_slow = ((float *)
+        const float freq_base = ((float *) tensor->op_params)[5];
+        const float freq_scale = ((float *) tensor->op_params)[6];
+        const float ext_factor = ((float *) tensor->op_params)[7];
+        const float attn_factor = ((float *) tensor->op_params)[8];
+        const float beta_fast = ((float *) tensor->op_params)[9];
+        const float beta_slow = ((float *) tensor->op_params)[10];
         tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
     } else if (tensor->op == GGML_OP_UNARY) {
         switch (ggml_get_unary_op(tensor)) {
@@ -6792,9 +7153,15 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         case GGML_UNARY_OP_GELU:
             tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
             break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone);
+            break;
         case GGML_UNARY_OP_RELU:
             tensor_clone = ggml_relu(ggml_ctx, src0_clone);
             break;
+        case GGML_UNARY_OP_TANH:
+            tensor_clone = ggml_tanh(ggml_ctx, src0_clone);
+            break;
         default:
             std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
             GGML_ABORT("fatal error");
@@ -6823,6 +7190,23 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
     } else if (tensor->op == GGML_OP_SUM_ROWS) {
         tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_IM2COL) {
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t s1 = tensor->op_params[1];
+        const int32_t p0 = tensor->op_params[2];
+        const int32_t p1 = tensor->op_params[3];
+        const int32_t d0 = tensor->op_params[4];
+        const int32_t d1 = tensor->op_params[5];
+
+        const bool is_2D = tensor->op_params[6] == 1;
+        tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+    } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+        const int32_t dim = tensor->op_params[0];
+        const int32_t max_period = tensor->op_params[1];
+        tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+        const float * op_params = (const float *)tensor->op_params;
+        tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ABORT("fatal error");
@@ -6834,7 +7218,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
 
     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-        ggml_vk_print_tensor(
+        ggml_vk_print_tensor(tensor_clone, "tensor_clone");
     }
 
     comp_size = ggml_nbytes(tensor_clone);
@@ -6851,9 +7235,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
     }
 
     ggml_free(ggml_ctx);
+
+    VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")");
 }
 
-static void ggml_vk_check_results_1(
+static void ggml_vk_check_results_1(ggml_tensor * tensor) {
     if (tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
@@ -6873,14 +7259,15 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu =
-
-
+        vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+        uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs;
+        if (offset + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - offset;
         }
 
-        ggml_vk_buffer_read(buffer_gpu,
+        ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;
@@ -6977,11 +7364,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
         std::cerr << std::endl << "Correct:" << std::endl;
         ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
         std::cerr << std::endl;
-        std::cerr << std::endl << "Result:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
-        std::cerr << std::endl << "Correct:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
-        std::cerr << std::endl;
         std::vector<const ggml_tensor *> done;
         ggml_vk_print_graph_origin(tensor, done);
     }
@@ -7018,5 +7400,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
     if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         free(tensor_data);
     }
+
+    VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
 }
 #endif
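The `GGML_VULKAN_CHECK_RESULTS` path above clones each node onto a CPU ggml context, recomputes it there, reads the GPU result back, and compares the two buffers. A minimal sketch of just that comparison step follows; plain float arrays stand in for the tensors, and the tolerance and error metric are illustrative, not the exact values used by the backend.

```cpp
// Sketch: element-wise comparison of a read-back GPU result against a CPU reference.
#include <cmath>
#include <cstdio>
#include <vector>

struct check_report {
    int    first_error = -1;   // index of the first out-of-tolerance element
    double avg_abs_err = 0.0;  // mean absolute error over all elements
};

static check_report compare_results(const std::vector<float> & gpu,
                                    const std::vector<float> & cpu,
                                    float tol) {
    check_report rep;
    double sum = 0.0;
    for (size_t i = 0; i < gpu.size() && i < cpu.size(); i++) {
        const double err = std::fabs((double) gpu[i] - (double) cpu[i]);
        sum += err;
        if (rep.first_error < 0 && err > tol) {
            rep.first_error = (int) i;
        }
    }
    rep.avg_abs_err = gpu.empty() ? 0.0 : sum / (double) gpu.size();
    return rep;
}

int main() {
    std::vector<float> cpu = { 1.0f, 2.0f, 3.0f, 4.0f };
    std::vector<float> gpu = { 1.0f, 2.0f, 3.5f, 4.0f }; // one element off

    const check_report rep = compare_results(gpu, cpu, 0.01f);
    if (rep.first_error >= 0) {
        std::printf("first mismatch at index %d, avg abs err %.4f\n",
                    rep.first_error, rep.avg_abs_err);
    } else {
        std::printf("results match, avg abs err %.4f\n", rep.avg_abs_err);
    }
    return 0;
}
```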