llama_cpp 0.15.4 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#include "ggml-vulkan.h"
|
|
2
|
-
|
|
2
|
+
#include <vulkan/vulkan_core.h>
|
|
3
3
|
#ifdef GGML_VULKAN_RUN_TESTS
|
|
4
4
|
#include <chrono>
|
|
5
5
|
#endif
|
|
@@ -9,12 +9,13 @@
|
|
|
9
9
|
#include <algorithm>
|
|
10
10
|
#include <cmath>
|
|
11
11
|
#include <iostream>
|
|
12
|
-
#include <limits>
|
|
13
12
|
#include <tuple>
|
|
14
13
|
#include <vector>
|
|
15
14
|
#include <sstream>
|
|
16
15
|
#include <utility>
|
|
17
16
|
#include <memory>
|
|
17
|
+
#include <limits>
|
|
18
|
+
#include <map>
|
|
18
19
|
|
|
19
20
|
#include "ggml.h"
|
|
20
21
|
#include "ggml-backend-impl.h"
|
|
@@ -137,6 +138,7 @@ struct vk_device {
|
|
|
137
138
|
vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
|
|
138
139
|
vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
|
|
139
140
|
vk_pipeline pipeline_mul_f32;
|
|
141
|
+
vk_pipeline pipeline_div_f32;
|
|
140
142
|
vk_pipeline pipeline_add_f32;
|
|
141
143
|
vk_pipeline pipeline_scale_f32;
|
|
142
144
|
vk_pipeline pipeline_sqr_f32;
|
|
@@ -149,9 +151,10 @@ struct vk_device {
|
|
|
149
151
|
vk_pipeline pipeline_relu_f32;
|
|
150
152
|
vk_pipeline pipeline_diag_mask_inf_f32;
|
|
151
153
|
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
|
152
|
-
vk_pipeline
|
|
154
|
+
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
|
|
153
155
|
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
|
|
154
156
|
vk_pipeline pipeline_argsort_f32;
|
|
157
|
+
vk_pipeline pipeline_sum_rows_f32;
|
|
155
158
|
|
|
156
159
|
std::vector<vk_pipeline_ref> pipelines;
|
|
157
160
|
|
|
@@ -226,17 +229,27 @@ typedef std::vector<vk_submission> vk_sequence;
|
|
|
226
229
|
|
|
227
230
|
struct vk_mat_mat_push_constants {
|
|
228
231
|
uint32_t M; uint32_t N; uint32_t K;
|
|
229
|
-
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
|
230
|
-
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
|
232
|
+
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
|
231
233
|
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
|
232
|
-
uint32_t
|
|
233
|
-
uint32_t
|
|
234
|
+
uint32_t k_split;
|
|
235
|
+
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
|
234
236
|
};
|
|
235
|
-
|
|
236
237
|
struct vk_mat_vec_push_constants {
|
|
237
238
|
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
|
239
|
+
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
|
238
240
|
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
|
241
|
+
};
|
|
242
|
+
|
|
243
|
+
struct vk_mat_mat_id_push_constants {
|
|
244
|
+
uint32_t M; uint32_t N; uint32_t K;
|
|
245
|
+
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
|
246
|
+
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
|
247
|
+
uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
|
|
248
|
+
};
|
|
249
|
+
struct vk_mat_vec_id_push_constants {
|
|
250
|
+
uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
|
239
251
|
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
|
252
|
+
uint32_t nei0; uint32_t ne11;
|
|
240
253
|
};
|
|
241
254
|
|
|
242
255
|
struct vk_op_push_constants {
|
|
@@ -271,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
|
|
|
271
284
|
|
|
272
285
|
struct vk_op_rope_push_constants {
|
|
273
286
|
uint32_t ncols;
|
|
287
|
+
uint32_t n_dims;
|
|
274
288
|
float freq_scale;
|
|
275
289
|
uint32_t p_delta_rows;
|
|
276
290
|
float freq_base;
|
|
277
291
|
float ext_factor;
|
|
278
292
|
float attn_factor;
|
|
279
|
-
float corr_dims[
|
|
280
|
-
};
|
|
281
|
-
|
|
282
|
-
struct vk_op_rope_neox_push_constants {
|
|
283
|
-
uint32_t ncols;
|
|
284
|
-
uint32_t ndims;
|
|
285
|
-
float freq_scale;
|
|
286
|
-
uint32_t p_delta_rows;
|
|
287
|
-
float freq_base;
|
|
288
|
-
float ext_factor;
|
|
289
|
-
float attn_factor;
|
|
290
|
-
float corr_dims[4];
|
|
293
|
+
float corr_dims[2];
|
|
291
294
|
float theta_scale;
|
|
292
|
-
|
|
293
|
-
uint32_t has_freq_facs;
|
|
295
|
+
uint32_t has_ff;
|
|
294
296
|
};
|
|
295
297
|
|
|
296
298
|
struct vk_op_soft_max_push_constants {
|
|
@@ -333,15 +335,12 @@ struct vk_context {
|
|
|
333
335
|
};
|
|
334
336
|
|
|
335
337
|
struct ggml_tensor_extra_gpu {
|
|
336
|
-
bool ready;
|
|
337
|
-
|
|
338
338
|
size_t ctx_idx;
|
|
339
339
|
|
|
340
340
|
vk_buffer_ref buffer_gpu;
|
|
341
341
|
uint64_t offset;
|
|
342
342
|
|
|
343
343
|
void reset() {
|
|
344
|
-
ready = false;
|
|
345
344
|
ctx_idx = 0;
|
|
346
345
|
buffer_gpu.reset();
|
|
347
346
|
offset = 0;
|
|
@@ -1028,7 +1027,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1028
1027
|
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1029
1028
|
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1030
1029
|
|
|
1031
|
-
|
|
1030
|
+
ctx->device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1032
1031
|
ctx->device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1033
1032
|
ctx->device->pipeline_matmul_id_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1034
1033
|
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
@@ -1040,7 +1039,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1040
1039
|
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1041
1040
|
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1042
1041
|
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1043
|
-
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>()
|
|
1042
|
+
ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
1044
1043
|
|
|
1045
1044
|
if (device->fp16) {
|
|
1046
1045
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
@@ -1078,12 +1077,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1078
1077
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1079
1078
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1080
1079
|
|
|
1081
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "
|
|
1082
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "
|
|
1083
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "
|
|
1084
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "
|
|
1085
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "
|
|
1086
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "
|
|
1080
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1081
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1082
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1083
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1084
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1085
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1087
1086
|
|
|
1088
1087
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1089
1088
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
@@ -1141,96 +1140,96 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1141
1140
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1142
1141
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1143
1142
|
|
|
1144
|
-
|
|
1145
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(
|
|
1146
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(
|
|
1147
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(
|
|
1148
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(
|
|
1149
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(
|
|
1150
|
-
|
|
1151
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(
|
|
1152
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(
|
|
1153
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(
|
|
1154
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(
|
|
1155
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(
|
|
1156
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(
|
|
1157
|
-
|
|
1158
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(
|
|
1159
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(
|
|
1160
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(
|
|
1161
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(
|
|
1162
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(
|
|
1163
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(
|
|
1164
|
-
|
|
1165
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(
|
|
1166
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(
|
|
1167
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(
|
|
1168
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(
|
|
1169
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(
|
|
1170
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(
|
|
1171
|
-
|
|
1172
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "
|
|
1173
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "
|
|
1174
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "
|
|
1175
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "
|
|
1176
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "
|
|
1177
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "
|
|
1178
|
-
|
|
1179
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(
|
|
1180
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(
|
|
1181
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(
|
|
1182
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(
|
|
1183
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(
|
|
1184
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(
|
|
1185
|
-
|
|
1186
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(
|
|
1187
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(
|
|
1188
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(
|
|
1189
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(
|
|
1190
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(
|
|
1191
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(
|
|
1192
|
-
|
|
1193
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(
|
|
1194
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(
|
|
1195
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(
|
|
1196
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(
|
|
1197
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(
|
|
1198
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(
|
|
1199
|
-
|
|
1200
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(
|
|
1201
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(
|
|
1202
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(
|
|
1203
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(
|
|
1204
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(
|
|
1205
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(
|
|
1206
|
-
|
|
1207
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(
|
|
1208
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(
|
|
1209
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(
|
|
1210
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(
|
|
1211
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(
|
|
1212
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(
|
|
1213
|
-
|
|
1214
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(
|
|
1215
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(
|
|
1216
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(
|
|
1217
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(
|
|
1218
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(
|
|
1219
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(
|
|
1220
|
-
|
|
1221
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(
|
|
1222
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(
|
|
1223
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(
|
|
1224
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(
|
|
1225
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(
|
|
1226
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(
|
|
1227
|
-
|
|
1228
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(
|
|
1229
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(
|
|
1230
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(
|
|
1231
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(
|
|
1232
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(
|
|
1233
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(
|
|
1143
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1144
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1145
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1146
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1147
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1148
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1149
|
+
|
|
1150
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1151
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1152
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1153
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1154
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1155
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1156
|
+
|
|
1157
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1158
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1159
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1160
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1161
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1162
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1163
|
+
|
|
1164
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1165
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1166
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1167
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1168
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1169
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1170
|
+
|
|
1171
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1172
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1173
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1174
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1175
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1176
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1177
|
+
|
|
1178
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1179
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1180
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1181
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1182
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1183
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1184
|
+
|
|
1185
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1186
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1187
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1188
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1189
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1190
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1191
|
+
|
|
1192
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1193
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1194
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1195
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1196
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1197
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1198
|
+
|
|
1199
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1200
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1201
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1202
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1203
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1204
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1205
|
+
|
|
1206
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1207
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1208
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1209
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1210
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1211
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1212
|
+
|
|
1213
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1214
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1215
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1216
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1217
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1218
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1219
|
+
|
|
1220
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1221
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1222
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1223
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1224
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1225
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1226
|
+
|
|
1227
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1228
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1229
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1230
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1231
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1232
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1234
1233
|
} else {
|
|
1235
1234
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1236
1235
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1330,99 +1329,100 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1330
1329
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1331
1330
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1332
1331
|
|
|
1333
|
-
|
|
1334
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(
|
|
1335
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(
|
|
1336
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1337
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1338
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1339
|
-
|
|
1340
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(
|
|
1341
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(
|
|
1342
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(
|
|
1343
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(
|
|
1344
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(
|
|
1345
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(
|
|
1346
|
-
|
|
1347
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(
|
|
1348
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(
|
|
1349
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(
|
|
1350
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1351
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1352
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1353
|
-
|
|
1354
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(
|
|
1355
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(
|
|
1356
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(
|
|
1357
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1358
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1359
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1360
|
-
|
|
1361
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "
|
|
1362
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "
|
|
1363
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "
|
|
1364
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "
|
|
1365
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "
|
|
1366
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "
|
|
1367
|
-
|
|
1368
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(
|
|
1369
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(
|
|
1370
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(
|
|
1371
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1372
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1373
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1374
|
-
|
|
1375
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(
|
|
1376
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(
|
|
1377
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(
|
|
1378
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1379
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1380
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1381
|
-
|
|
1382
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(
|
|
1383
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(
|
|
1384
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(
|
|
1385
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1386
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1387
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1388
|
-
|
|
1389
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(
|
|
1390
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(
|
|
1391
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(
|
|
1392
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1393
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1394
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1395
|
-
|
|
1396
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(
|
|
1397
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(
|
|
1398
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(
|
|
1399
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1400
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1401
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1402
|
-
|
|
1403
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(
|
|
1404
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(
|
|
1405
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(
|
|
1406
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1407
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1408
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1409
|
-
|
|
1410
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(
|
|
1411
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(
|
|
1412
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(
|
|
1413
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1414
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1415
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1416
|
-
|
|
1417
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(
|
|
1418
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(
|
|
1419
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(
|
|
1420
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1421
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1422
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(
|
|
1332
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1333
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1334
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1335
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1336
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1337
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1338
|
+
|
|
1339
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1340
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1341
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1342
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1343
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1344
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1345
|
+
|
|
1346
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
|
1347
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
|
1348
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
1349
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
|
|
1350
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
|
|
1351
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
|
|
1352
|
+
|
|
1353
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1354
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1355
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1356
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1357
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1358
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1359
|
+
|
|
1360
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_1_f32_l", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1361
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_1_f32_m", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1362
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_1_f32_s", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1363
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_1_f32_aligned_l", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1364
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_1_f32_aligned_m", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1365
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_1_f32_aligned_s", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1366
|
+
|
|
1367
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1368
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1369
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1370
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1371
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1372
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1373
|
+
|
|
1374
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1375
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1376
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1377
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1378
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1379
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1380
|
+
|
|
1381
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1382
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1383
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1384
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1385
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1386
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1387
|
+
|
|
1388
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1389
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1390
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1391
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1392
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1393
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1394
|
+
|
|
1395
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1396
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1397
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1398
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1399
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1400
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1401
|
+
|
|
1402
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1403
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1404
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1405
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1406
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1407
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1408
|
+
|
|
1409
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1410
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1411
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1412
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1413
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1414
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1415
|
+
|
|
1416
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1417
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1418
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1419
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
|
1420
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
|
1421
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
1423
1422
|
}
|
|
1424
1423
|
|
|
1425
1424
|
// mul mat vec
|
|
1425
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1426
1426
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1427
1427
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1428
1428
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1435,6 +1435,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1435
1435
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1436
1436
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1437
1437
|
|
|
1438
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1438
1439
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1439
1440
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1440
1441
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1447,17 +1448,18 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1447
1448
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1448
1449
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1449
1450
|
|
|
1450
|
-
|
|
1451
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1452
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1453
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1454
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1455
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1456
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1457
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1458
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1459
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1460
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[
|
|
1451
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1452
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1453
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1454
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1455
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1456
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1457
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1458
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1459
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1460
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1461
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1462
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1461
1463
|
|
|
1462
1464
|
// dequant shaders
|
|
1463
1465
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
|
@@ -1505,6 +1507,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1505
1507
|
|
|
1506
1508
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1507
1509
|
|
|
1510
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
|
1511
|
+
|
|
1508
1512
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
1509
1513
|
|
|
1510
1514
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
|
@@ -1520,13 +1524,15 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
|
1520
1524
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
|
1521
1525
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
|
1522
1526
|
|
|
1523
|
-
ggml_vk_create_pipeline(ctx, ctx->device->
|
|
1524
|
-
ggml_vk_create_pipeline(ctx, ctx->device->
|
|
1527
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
1528
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
1525
1529
|
|
|
1526
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(
|
|
1527
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(
|
|
1530
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
1531
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
1528
1532
|
|
|
1529
1533
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
|
1534
|
+
|
|
1535
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
1530
1536
|
}
|
|
1531
1537
|
|
|
1532
1538
|
static void ggml_vk_print_gpu_info(size_t idx) {
|
|
@@ -1550,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|
|
1550
1556
|
vk::PhysicalDeviceProperties2 props2;
|
|
1551
1557
|
vk::PhysicalDeviceMaintenance3Properties props3;
|
|
1552
1558
|
vk::PhysicalDeviceSubgroupProperties subgroup_props;
|
|
1559
|
+
vk::PhysicalDeviceDriverProperties driver_props;
|
|
1553
1560
|
props2.pNext = &props3;
|
|
1554
1561
|
props3.pNext = &subgroup_props;
|
|
1562
|
+
subgroup_props.pNext = &driver_props;
|
|
1555
1563
|
physical_device.getProperties2(&props2);
|
|
1556
1564
|
|
|
1557
1565
|
const size_t subgroup_size = subgroup_props.subgroupSize;
|
|
@@ -1595,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|
|
1595
1603
|
fp16 = fp16 && vk12_features.shaderFloat16;
|
|
1596
1604
|
|
|
1597
1605
|
std::string device_name = props2.properties.deviceName.data();
|
|
1598
|
-
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
|
|
1606
|
+
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
|
|
1599
1607
|
|
|
1600
1608
|
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
|
1601
1609
|
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
|
|
@@ -1691,7 +1699,78 @@ void ggml_vk_instance_init() {
|
|
|
1691
1699
|
vk::PhysicalDeviceProperties props = devices[i].getProperties();
|
|
1692
1700
|
|
|
1693
1701
|
if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
|
|
1694
|
-
|
|
1702
|
+
// Check if there are two physical devices corresponding to the same GPU
|
|
1703
|
+
auto old_device = std::find_if(
|
|
1704
|
+
vk_instance.device_indices.begin(),
|
|
1705
|
+
vk_instance.device_indices.end(),
|
|
1706
|
+
[&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
|
|
1707
|
+
);
|
|
1708
|
+
if (old_device == vk_instance.device_indices.end()) {
|
|
1709
|
+
vk_instance.device_indices.push_back(i);
|
|
1710
|
+
} else {
|
|
1711
|
+
// There can be two physical devices corresponding to the same GPU if there are 2 different drivers
|
|
1712
|
+
// This can cause an error when splitting layers across the devices, so we need to keep only 1
|
|
1713
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
1714
|
+
std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
|
|
1715
|
+
#endif
|
|
1716
|
+
|
|
1717
|
+
vk::PhysicalDeviceProperties2 old_prop;
|
|
1718
|
+
vk::PhysicalDeviceDriverProperties old_driver;
|
|
1719
|
+
old_prop.pNext = &old_driver;
|
|
1720
|
+
devices[*old_device].getProperties2(&old_prop);
|
|
1721
|
+
|
|
1722
|
+
vk::PhysicalDeviceProperties2 new_prop;
|
|
1723
|
+
vk::PhysicalDeviceDriverProperties new_driver;
|
|
1724
|
+
new_prop.pNext = &new_driver;
|
|
1725
|
+
devices[i].getProperties2(&new_prop);
|
|
1726
|
+
|
|
1727
|
+
std::map<vk::DriverId, int> driver_priorities {};
|
|
1728
|
+
int old_priority = std::numeric_limits<int>::max();
|
|
1729
|
+
int new_priority = std::numeric_limits<int>::max();
|
|
1730
|
+
|
|
1731
|
+
// Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver IDs
|
|
1732
|
+
// Smaller number -> higher priority
|
|
1733
|
+
switch (old_prop.properties.vendorID) {
|
|
1734
|
+
case VK_VENDOR_ID_AMD:
|
|
1735
|
+
driver_priorities[vk::DriverId::eMesaRadv] = 1;
|
|
1736
|
+
driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
|
|
1737
|
+
driver_priorities[vk::DriverId::eAmdProprietary] = 3;
|
|
1738
|
+
break;
|
|
1739
|
+
case VK_VENDOR_ID_INTEL:
|
|
1740
|
+
driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
|
|
1741
|
+
driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
|
|
1742
|
+
break;
|
|
1743
|
+
case VK_VENDOR_ID_NVIDIA:
|
|
1744
|
+
driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
|
|
1745
|
+
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
|
|
1746
|
+
driver_priorities[vk::DriverId::eMesaNvk] = 2;
|
|
1747
|
+
#endif
|
|
1748
|
+
break;
|
|
1749
|
+
}
|
|
1750
|
+
|
|
1751
|
+
if (driver_priorities.count(old_driver.driverID)) {
|
|
1752
|
+
old_priority = driver_priorities[old_driver.driverID];
|
|
1753
|
+
}
|
|
1754
|
+
if (driver_priorities.count(new_driver.driverID)) {
|
|
1755
|
+
new_priority = driver_priorities[new_driver.driverID];
|
|
1756
|
+
}
|
|
1757
|
+
|
|
1758
|
+
if (new_priority < old_priority) {
|
|
1759
|
+
auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
|
|
1760
|
+
vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
|
|
1761
|
+
vk_instance.device_indices.push_back(i);
|
|
1762
|
+
|
|
1763
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
1764
|
+
std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
|
|
1765
|
+
#endif
|
|
1766
|
+
}
|
|
1767
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
1768
|
+
else {
|
|
1769
|
+
std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
|
|
1770
|
+
|
|
1771
|
+
}
|
|
1772
|
+
#endif
|
|
1773
|
+
}
|
|
1695
1774
|
}
|
|
1696
1775
|
}
|
|
1697
1776
|
|
|
@@ -1949,6 +2028,33 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
|
1949
2028
|
return ctx->device->pipeline_dequant_mul_mat_mat[src0_type];
|
|
1950
2029
|
}
|
|
1951
2030
|
|
|
2031
|
+
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
|
2032
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
2033
|
+
std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
|
|
2034
|
+
#endif
|
|
2035
|
+
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
|
|
2036
|
+
|
|
2037
|
+
switch (a_type) {
|
|
2038
|
+
case GGML_TYPE_F32:
|
|
2039
|
+
case GGML_TYPE_F16:
|
|
2040
|
+
case GGML_TYPE_Q4_0:
|
|
2041
|
+
case GGML_TYPE_Q4_1:
|
|
2042
|
+
case GGML_TYPE_Q5_0:
|
|
2043
|
+
case GGML_TYPE_Q5_1:
|
|
2044
|
+
case GGML_TYPE_Q8_0:
|
|
2045
|
+
case GGML_TYPE_Q2_K:
|
|
2046
|
+
case GGML_TYPE_Q3_K:
|
|
2047
|
+
case GGML_TYPE_Q4_K:
|
|
2048
|
+
case GGML_TYPE_Q5_K:
|
|
2049
|
+
case GGML_TYPE_Q6_K:
|
|
2050
|
+
break;
|
|
2051
|
+
default:
|
|
2052
|
+
return nullptr;
|
|
2053
|
+
}
|
|
2054
|
+
|
|
2055
|
+
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type];
|
|
2056
|
+
}
|
|
2057
|
+
|
|
1952
2058
|
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
|
|
1953
2059
|
#ifdef GGML_VULKAN_DEBUG
|
|
1954
2060
|
std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
|
|
@@ -1984,13 +2090,14 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
|
|
1984
2090
|
return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
|
|
1985
2091
|
}
|
|
1986
2092
|
|
|
1987
|
-
static vk_pipeline
|
|
2093
|
+
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
|
|
1988
2094
|
#ifdef GGML_VULKAN_DEBUG
|
|
1989
2095
|
std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
|
|
1990
2096
|
#endif
|
|
1991
|
-
GGML_ASSERT(b_type == GGML_TYPE_F32
|
|
2097
|
+
GGML_ASSERT(b_type == GGML_TYPE_F32);
|
|
1992
2098
|
|
|
1993
2099
|
switch (a_type) {
|
|
2100
|
+
case GGML_TYPE_F32:
|
|
1994
2101
|
case GGML_TYPE_F16:
|
|
1995
2102
|
case GGML_TYPE_Q4_0:
|
|
1996
2103
|
case GGML_TYPE_Q4_1:
|
|
@@ -2007,7 +2114,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
|
|
2007
2114
|
return nullptr;
|
|
2008
2115
|
}
|
|
2009
2116
|
|
|
2010
|
-
return
|
|
2117
|
+
return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[a_type];
|
|
2011
2118
|
}
|
|
2012
2119
|
|
|
2013
2120
|
static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
|
@@ -2155,7 +2262,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
|
|
|
2155
2262
|
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
|
|
2156
2263
|
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
|
2157
2264
|
#ifdef GGML_VULKAN_DEBUG
|
|
2158
|
-
std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ",
|
|
2265
|
+
std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
|
|
2266
|
+
for (auto& buffer : buffers) {
|
|
2267
|
+
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
|
|
2268
|
+
}
|
|
2269
|
+
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
|
|
2159
2270
|
#endif
|
|
2160
2271
|
std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
|
|
2161
2272
|
std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
|
|
@@ -2736,22 +2847,21 @@ static void ggml_vk_matmul(
|
|
|
2736
2847
|
ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
|
|
2737
2848
|
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
|
|
2738
2849
|
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
|
|
2739
|
-
uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
|
|
2740
2850
|
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
|
|
2741
|
-
uint32_t
|
|
2851
|
+
uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
|
|
2742
2852
|
#ifdef GGML_VULKAN_DEBUG
|
|
2743
|
-
std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "),
|
|
2853
|
+
std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
|
|
2744
2854
|
#endif
|
|
2745
2855
|
ggml_vk_sync_buffers(subctx);
|
|
2746
2856
|
if (split_k == 1) {
|
|
2747
|
-
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d,
|
|
2857
|
+
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
|
|
2748
2858
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
|
|
2749
2859
|
return;
|
|
2750
2860
|
}
|
|
2751
2861
|
|
|
2752
2862
|
GGML_ASSERT(batch_stride_d == m * n);
|
|
2753
2863
|
|
|
2754
|
-
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3
|
|
2864
|
+
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
|
|
2755
2865
|
// Make sure enough workgroups get assigned for split k to work
|
|
2756
2866
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
|
2757
2867
|
ggml_vk_sync_buffers(subctx);
|
|
@@ -2761,29 +2871,20 @@ static void ggml_vk_matmul(
|
|
|
2761
2871
|
|
|
2762
2872
|
static void ggml_vk_matmul_id(
|
|
2763
2873
|
ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
|
|
2764
|
-
vk_subbuffer&&
|
|
2874
|
+
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
|
|
2765
2875
|
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
|
|
2766
|
-
uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
|
|
2767
2876
|
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
|
|
2768
|
-
uint32_t
|
|
2877
|
+
uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
|
|
2769
2878
|
#ifdef GGML_VULKAN_DEBUG
|
|
2770
|
-
std::cerr << "
|
|
2879
|
+
std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
|
|
2880
|
+
"m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
|
|
2881
|
+
"batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
|
|
2882
|
+
"n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
|
|
2771
2883
|
#endif
|
|
2772
2884
|
ggml_vk_sync_buffers(subctx);
|
|
2773
|
-
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
return;
|
|
2777
|
-
}
|
|
2778
|
-
|
|
2779
|
-
GGML_ASSERT(batch_stride_d == m * n);
|
|
2780
|
-
|
|
2781
|
-
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d, expert_stride_b, expert_stride_d, idx, nbi1, n_as };
|
|
2782
|
-
// Make sure enough workgroups get assigned for split k to work
|
|
2783
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { ids, b, split_k_buffer, a }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
|
2784
|
-
ggml_vk_sync_buffers(subctx);
|
|
2785
|
-
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
|
|
2786
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
|
|
2885
|
+
const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
|
|
2886
|
+
nei0, nei1, nbi1, ne11 };
|
|
2887
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
|
|
2787
2888
|
}
|
|
2788
2889
|
|
|
2789
2890
|
static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|
@@ -2908,7 +3009,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
|
2908
3009
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
2909
3010
|
|
|
2910
3011
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
2911
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3012
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
2912
3013
|
GGML_ASSERT(d_D != nullptr);
|
|
2913
3014
|
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
|
2914
3015
|
vk_buffer d_X;
|
|
@@ -2917,12 +3018,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
|
2917
3018
|
uint64_t y_buf_offset = 0;
|
|
2918
3019
|
if (!src0_uma) {
|
|
2919
3020
|
d_Qx = extra_src0->buffer_gpu.lock();
|
|
2920
|
-
qx_buf_offset = extra_src0->offset;
|
|
3021
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
2921
3022
|
GGML_ASSERT(d_Qx != nullptr);
|
|
2922
3023
|
}
|
|
2923
3024
|
if (!src1_uma) {
|
|
2924
3025
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
2925
|
-
qy_buf_offset = extra_src1->offset;
|
|
3026
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
2926
3027
|
GGML_ASSERT(d_Qy != nullptr);
|
|
2927
3028
|
}
|
|
2928
3029
|
if (qx_needs_dequant) {
|
|
@@ -2997,8 +3098,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
|
2997
3098
|
ctx, subctx, pipeline,
|
|
2998
3099
|
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
|
|
2999
3100
|
{ d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
|
|
3000
|
-
ne01, ne11, ne10,
|
|
3001
|
-
|
|
3101
|
+
ne01, ne11, ne10,
|
|
3102
|
+
ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
|
|
3103
|
+
split_k, ne12*ne13, ne02, ne12, r2, r3
|
|
3002
3104
|
); // NOLINT
|
|
3003
3105
|
}
|
|
3004
3106
|
|
|
@@ -3072,7 +3174,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
|
3072
3174
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3073
3175
|
|
|
3074
3176
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3075
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3177
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3076
3178
|
GGML_ASSERT(d_D != nullptr);
|
|
3077
3179
|
vk_buffer d_X;
|
|
3078
3180
|
uint64_t x_buf_offset = 0;
|
|
@@ -3080,12 +3182,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
|
3080
3182
|
uint64_t y_buf_offset = 0;
|
|
3081
3183
|
if(!src0_uma) {
|
|
3082
3184
|
d_Qx = extra_src0->buffer_gpu.lock();
|
|
3083
|
-
qx_buf_offset = extra_src0->offset;
|
|
3185
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3084
3186
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3085
3187
|
}
|
|
3086
3188
|
if(!src1_uma) {
|
|
3087
3189
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3088
|
-
qy_buf_offset = extra_src1->offset;
|
|
3190
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3089
3191
|
GGML_ASSERT(d_Qy != nullptr);
|
|
3090
3192
|
}
|
|
3091
3193
|
if (qx_needs_dequant) {
|
|
@@ -3150,8 +3252,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
|
3150
3252
|
// compute
|
|
3151
3253
|
const vk_mat_vec_push_constants pc = {
|
|
3152
3254
|
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
|
3153
|
-
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
|
3154
3255
|
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
|
|
3256
|
+
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
|
3155
3257
|
};
|
|
3156
3258
|
ggml_vk_sync_buffers(subctx);
|
|
3157
3259
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1});
|
|
@@ -3204,14 +3306,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
3204
3306
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3205
3307
|
|
|
3206
3308
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3207
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3309
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3208
3310
|
GGML_ASSERT(d_D != nullptr);
|
|
3209
3311
|
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
|
3210
|
-
const uint64_t qx_buf_offset = extra_src0->offset;
|
|
3312
|
+
const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3211
3313
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3212
3314
|
if (!src1_uma) {
|
|
3213
3315
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3214
|
-
qy_buf_offset = extra_src1->offset;
|
|
3316
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3215
3317
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3216
3318
|
}
|
|
3217
3319
|
|
|
@@ -3281,14 +3383,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
3281
3383
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3282
3384
|
|
|
3283
3385
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3284
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3386
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3285
3387
|
GGML_ASSERT(d_D != nullptr);
|
|
3286
3388
|
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
|
3287
|
-
const uint64_t qx_buf_offset = extra_src0->offset;
|
|
3389
|
+
const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3288
3390
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3289
3391
|
if (!src1_uma) {
|
|
3290
3392
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3291
|
-
qy_buf_offset = extra_src1->offset;
|
|
3393
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3292
3394
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3293
3395
|
}
|
|
3294
3396
|
|
|
@@ -3311,26 +3413,26 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3311
3413
|
#ifdef GGML_VULKAN_DEBUG
|
|
3312
3414
|
std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
|
|
3313
3415
|
#endif
|
|
3314
|
-
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
|
|
3416
|
+
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
|
|
3315
3417
|
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
|
|
3316
|
-
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) &&
|
|
3418
|
+
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
|
|
3317
3419
|
ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst);
|
|
3318
|
-
} else if (
|
|
3420
|
+
} else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
|
|
3319
3421
|
ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst);
|
|
3320
3422
|
} else {
|
|
3321
3423
|
ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst);
|
|
3322
3424
|
}
|
|
3323
3425
|
}
|
|
3324
3426
|
|
|
3325
|
-
|
|
3427
|
+
static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
|
3326
3428
|
#ifdef GGML_VULKAN_DEBUG
|
|
3327
|
-
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
|
3328
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
|
3329
|
-
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ",
|
|
3330
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
|
3429
|
+
std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
3430
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
|
3431
|
+
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
|
|
3432
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
|
3331
3433
|
#endif
|
|
3332
|
-
GGML_ASSERT(src0->type == GGML_TYPE_I32);
|
|
3333
3434
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
|
3435
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
|
3334
3436
|
|
|
3335
3437
|
const uint64_t ne00 = src0->ne[0];
|
|
3336
3438
|
const uint64_t ne01 = src0->ne[1];
|
|
@@ -3342,16 +3444,18 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3342
3444
|
const uint64_t ne12 = src1->ne[2];
|
|
3343
3445
|
const uint64_t ne13 = src1->ne[3];
|
|
3344
3446
|
|
|
3345
|
-
const
|
|
3447
|
+
const uint64_t nei0 = ids->ne[0];
|
|
3448
|
+
const uint64_t nei1 = ids->ne[1];
|
|
3449
|
+
GGML_ASSERT(nei0 * nei1 <= 2048);
|
|
3450
|
+
|
|
3451
|
+
const uint32_t nbi1 = ids->nb[1];
|
|
3452
|
+
const uint32_t nbi2 = ids->nb[2];
|
|
3346
3453
|
|
|
3347
3454
|
const uint64_t ne20 = dst->ne[0];
|
|
3348
3455
|
const uint64_t ne21 = dst->ne[1];
|
|
3456
|
+
const uint64_t ne22 = dst->ne[2];
|
|
3457
|
+
const uint64_t ne23 = dst->ne[3];
|
|
3349
3458
|
|
|
3350
|
-
const uint64_t r2 = ne12 / ne02;
|
|
3351
|
-
const uint64_t r3 = ne13 / ne03;
|
|
3352
|
-
|
|
3353
|
-
const uint32_t nbi1 = src0->nb[1];
|
|
3354
|
-
const uint32_t idx = ((uint32_t *) dst->op_params)[0];
|
|
3355
3459
|
const uint64_t n_as = ne02;
|
|
3356
3460
|
|
|
3357
3461
|
GGML_ASSERT(n_as <= 8);
|
|
@@ -3365,15 +3469,20 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3365
3469
|
size_t qx_buf_offset = 0;
|
|
3366
3470
|
vk_buffer d_Qy;
|
|
3367
3471
|
size_t qy_buf_offset = 0;
|
|
3472
|
+
vk_buffer d_ids;
|
|
3473
|
+
size_t ids_buf_offset = 0;
|
|
3368
3474
|
|
|
3369
3475
|
bool src0_uma = false;
|
|
3370
3476
|
bool src1_uma = false;
|
|
3477
|
+
bool ids_uma = false;
|
|
3371
3478
|
|
|
3372
3479
|
if (ctx->device->uma) {
|
|
3373
3480
|
ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
|
|
3374
3481
|
ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
|
|
3482
|
+
ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset);
|
|
3375
3483
|
src0_uma = d_Qx != nullptr;
|
|
3376
3484
|
src1_uma = d_Qy != nullptr;
|
|
3485
|
+
ids_uma = d_ids != nullptr;
|
|
3377
3486
|
}
|
|
3378
3487
|
|
|
3379
3488
|
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
|
|
@@ -3393,41 +3502,44 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3393
3502
|
// Not implemented
|
|
3394
3503
|
GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
|
|
3395
3504
|
|
|
3396
|
-
const
|
|
3397
|
-
const
|
|
3398
|
-
const
|
|
3399
|
-
|
|
3400
|
-
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
|
|
3401
|
-
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
|
|
3505
|
+
const uint64_t x_ne = ne01 * ne00;
|
|
3506
|
+
const uint64_t y_ne = ne11 * ne10;
|
|
3507
|
+
const uint64_t d_ne = ne21 * ne20;
|
|
3402
3508
|
|
|
3403
|
-
const uint32_t
|
|
3509
|
+
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, nei1));
|
|
3510
|
+
const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;
|
|
3404
3511
|
|
|
3405
|
-
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01,
|
|
3512
|
+
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, nei1, aligned);
|
|
3406
3513
|
|
|
3407
3514
|
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
|
|
3408
3515
|
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
|
3409
3516
|
const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
|
|
3410
3517
|
const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
|
3518
|
+
const uint64_t ids_sz = nbi2;
|
|
3411
3519
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3412
3520
|
|
|
3413
3521
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3414
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3522
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3415
3523
|
GGML_ASSERT(d_D != nullptr);
|
|
3416
|
-
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
|
3417
3524
|
vk_buffer d_X;
|
|
3418
3525
|
uint64_t x_buf_offset = 0;
|
|
3419
3526
|
vk_buffer d_Y;
|
|
3420
3527
|
uint64_t y_buf_offset = 0;
|
|
3421
3528
|
if (!src0_uma) {
|
|
3422
3529
|
d_Qx = extra_src0->buffer_gpu.lock();
|
|
3423
|
-
qx_buf_offset = extra_src0->offset;
|
|
3530
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3424
3531
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3425
3532
|
}
|
|
3426
3533
|
if (!src1_uma) {
|
|
3427
3534
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3428
|
-
qy_buf_offset = extra_src1->offset;
|
|
3535
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3429
3536
|
GGML_ASSERT(d_Qy != nullptr);
|
|
3430
3537
|
}
|
|
3538
|
+
if (!ids_uma) {
|
|
3539
|
+
d_ids = extra_ids->buffer_gpu.lock();
|
|
3540
|
+
ids_buf_offset = extra_ids->offset + ids->view_offs;
|
|
3541
|
+
GGML_ASSERT(d_ids != nullptr);
|
|
3542
|
+
}
|
|
3431
3543
|
if (qx_needs_dequant) {
|
|
3432
3544
|
d_X = ctx->prealloc_x;
|
|
3433
3545
|
GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
|
|
@@ -3469,9 +3581,6 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3469
3581
|
if (qy_needs_dequant) {
|
|
3470
3582
|
ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
3471
3583
|
}
|
|
3472
|
-
if (split_k > 1) {
|
|
3473
|
-
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
|
|
3474
|
-
}
|
|
3475
3584
|
|
|
3476
3585
|
if (x_non_contig) {
|
|
3477
3586
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
|
@@ -3496,23 +3605,26 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
3496
3605
|
}
|
|
3497
3606
|
|
|
3498
3607
|
// compute
|
|
3499
|
-
|
|
3608
|
+
ggml_vk_matmul_id(
|
|
3500
3609
|
ctx, subctx, pipeline,
|
|
3501
3610
|
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
|
|
3502
|
-
{ d_D, d_buf_offset, d_sz *
|
|
3503
|
-
ne01,
|
|
3504
|
-
|
|
3611
|
+
{ d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
|
|
3612
|
+
ne01, ne21, ne10, ne10, ne10, ne01,
|
|
3613
|
+
stride_batch_x, stride_batch_y, ne20*ne21,
|
|
3614
|
+
n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
|
|
3505
3615
|
); // NOLINT
|
|
3506
3616
|
}
|
|
3507
3617
|
|
|
3508
|
-
static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3618
|
+
static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
|
3509
3619
|
#ifdef GGML_VULKAN_DEBUG
|
|
3510
|
-
std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
|
3511
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
|
3512
|
-
std::cerr << "), (" <<
|
|
3620
|
+
std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
3621
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
|
3622
|
+
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
|
|
3623
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
|
3513
3624
|
#endif
|
|
3514
3625
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
|
3515
3626
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
|
3627
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
|
3516
3628
|
|
|
3517
3629
|
const uint64_t ne00 = src0->ne[0];
|
|
3518
3630
|
const uint64_t ne01 = src0->ne[1];
|
|
@@ -3524,36 +3636,41 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
3524
3636
|
const uint64_t ne12 = src1->ne[2];
|
|
3525
3637
|
const uint64_t ne13 = src1->ne[3];
|
|
3526
3638
|
|
|
3527
|
-
|
|
3639
|
+
const uint64_t nei0 = ids->ne[0];
|
|
3640
|
+
const uint64_t nei1 = ids->ne[1];
|
|
3641
|
+
|
|
3642
|
+
const uint64_t nbi2 = ids->nb[2];
|
|
3643
|
+
|
|
3644
|
+
GGML_ASSERT(nei1 == 1);
|
|
3528
3645
|
|
|
3529
3646
|
const uint64_t ne20 = dst->ne[0];
|
|
3530
3647
|
const uint64_t ne21 = dst->ne[1];
|
|
3531
3648
|
const uint64_t ne22 = dst->ne[2];
|
|
3532
3649
|
const uint64_t ne23 = dst->ne[3];
|
|
3533
3650
|
|
|
3534
|
-
const uint64_t nb22 = dst->nb[2];
|
|
3535
|
-
const uint64_t nb23 = dst->nb[3];
|
|
3536
|
-
|
|
3537
|
-
const uint64_t r2 = ne12 / ne02;
|
|
3538
|
-
const uint64_t r3 = ne13 / ne03;
|
|
3539
|
-
|
|
3540
3651
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
3541
3652
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
|
3542
3653
|
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
|
3654
|
+
ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
|
|
3543
3655
|
|
|
3544
3656
|
vk_buffer d_Qx;
|
|
3545
3657
|
size_t qx_buf_offset = 0;
|
|
3546
3658
|
vk_buffer d_Qy;
|
|
3547
3659
|
size_t qy_buf_offset = 0;
|
|
3660
|
+
vk_buffer d_ids;
|
|
3661
|
+
size_t ids_buf_offset = 0;
|
|
3548
3662
|
|
|
3549
3663
|
bool src0_uma = false;
|
|
3550
3664
|
bool src1_uma = false;
|
|
3665
|
+
bool ids_uma = false;
|
|
3551
3666
|
|
|
3552
3667
|
if (ctx->device->uma) {
|
|
3553
3668
|
ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
|
|
3554
3669
|
ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
|
|
3670
|
+
ggml_vk_host_get(ctx, ids->data, d_ids, ids_buf_offset);
|
|
3555
3671
|
src0_uma = d_Qx != nullptr;
|
|
3556
3672
|
src1_uma = d_Qy != nullptr;
|
|
3673
|
+
ids_uma = d_ids != nullptr;
|
|
3557
3674
|
}
|
|
3558
3675
|
|
|
3559
3676
|
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
|
|
@@ -3569,16 +3686,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
3569
3686
|
|
|
3570
3687
|
const uint64_t x_ne = ne01 * ne00;
|
|
3571
3688
|
const uint64_t y_ne = ne11 * ne10;
|
|
3572
|
-
const uint64_t d_ne =
|
|
3689
|
+
const uint64_t d_ne = ne21 * ne20;
|
|
3573
3690
|
|
|
3574
3691
|
const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
|
3575
3692
|
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
|
3576
3693
|
const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
|
|
3577
3694
|
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
|
3695
|
+
const uint64_t ids_sz = nbi2;
|
|
3578
3696
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
|
3579
3697
|
|
|
3580
3698
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3581
|
-
const uint64_t d_buf_offset = extra->offset;
|
|
3699
|
+
const uint64_t d_buf_offset = extra->offset + dst->view_offs;
|
|
3582
3700
|
GGML_ASSERT(d_D != nullptr);
|
|
3583
3701
|
vk_buffer d_X;
|
|
3584
3702
|
uint64_t x_buf_offset = 0;
|
|
@@ -3586,14 +3704,19 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
3586
3704
|
uint64_t y_buf_offset = 0;
|
|
3587
3705
|
if(!src0_uma) {
|
|
3588
3706
|
d_Qx = extra_src0->buffer_gpu.lock();
|
|
3589
|
-
qx_buf_offset = extra_src0->offset;
|
|
3707
|
+
qx_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3590
3708
|
GGML_ASSERT(d_Qx != nullptr);
|
|
3591
3709
|
}
|
|
3592
3710
|
if(!src1_uma) {
|
|
3593
3711
|
d_Qy = extra_src1->buffer_gpu.lock();
|
|
3594
|
-
qy_buf_offset = extra_src1->offset;
|
|
3712
|
+
qy_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3595
3713
|
GGML_ASSERT(d_Qy != nullptr);
|
|
3596
3714
|
}
|
|
3715
|
+
if(!ids_uma) {
|
|
3716
|
+
d_ids = extra_ids->buffer_gpu.lock();
|
|
3717
|
+
ids_buf_offset = extra_ids->offset + ids->view_offs;
|
|
3718
|
+
GGML_ASSERT(d_ids != nullptr);
|
|
3719
|
+
}
|
|
3597
3720
|
if (qx_needs_dequant) {
|
|
3598
3721
|
d_X = ctx->prealloc_x;
|
|
3599
3722
|
} else {
|
|
@@ -3619,7 +3742,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
3619
3742
|
} else {
|
|
3620
3743
|
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
|
|
3621
3744
|
}
|
|
3622
|
-
vk_pipeline dmmv =
|
|
3745
|
+
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
|
|
3623
3746
|
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
|
|
3624
3747
|
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
|
|
3625
3748
|
GGML_ASSERT(dmmv != nullptr);
|
|
@@ -3642,27 +3765,34 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
3642
3765
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
3643
3766
|
}
|
|
3644
3767
|
|
|
3645
|
-
uint32_t stride_batch_x = ne00*ne01;
|
|
3646
3768
|
uint32_t stride_batch_y = ne10*ne11;
|
|
3647
3769
|
|
|
3648
|
-
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
|
3649
|
-
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
|
3650
|
-
}
|
|
3651
|
-
|
|
3652
3770
|
if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
|
|
3653
3771
|
stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
|
|
3654
3772
|
}
|
|
3655
3773
|
|
|
3656
3774
|
// compute
|
|
3657
|
-
const
|
|
3775
|
+
const vk_mat_vec_id_push_constants pc = {
|
|
3658
3776
|
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
|
3659
|
-
(uint32_t)
|
|
3660
|
-
|
|
3661
|
-
// 0, 0, 0, 0, 1
|
|
3777
|
+
(uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21),
|
|
3778
|
+
(uint32_t)nei0, (uint32_t)ne11,
|
|
3662
3779
|
};
|
|
3663
3780
|
ggml_vk_sync_buffers(subctx);
|
|
3664
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
3665
|
-
}
|
|
3781
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
3782
|
+
{ { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
|
|
3783
|
+
sizeof(vk_mat_vec_id_push_constants), &pc, { (uint32_t)ne01, (uint32_t)nei0, 1 });
|
|
3784
|
+
}
|
|
3785
|
+
|
|
3786
|
+
static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
3787
|
+
#ifdef GGML_VULKAN_DEBUG
|
|
3788
|
+
std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
|
|
3789
|
+
#endif
|
|
3790
|
+
if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
|
|
3791
|
+
ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
|
|
3792
|
+
} else {
|
|
3793
|
+
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst);
|
|
3794
|
+
}
|
|
3795
|
+
}
|
|
3666
3796
|
|
|
3667
3797
|
static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3668
3798
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
|
@@ -3699,9 +3829,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
|
|
3699
3829
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
|
3700
3830
|
|
|
3701
3831
|
const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
|
|
3702
|
-
const uint64_t src_offset = extra_src0->offset;
|
|
3832
|
+
const uint64_t src_offset = extra_src0->offset + src0->view_offs;
|
|
3703
3833
|
vk_buffer dst_buf = extra->buffer_gpu.lock();
|
|
3704
|
-
const uint64_t dst_offset = extra->offset;
|
|
3834
|
+
const uint64_t dst_offset = extra->offset + dst->view_offs;
|
|
3705
3835
|
|
|
3706
3836
|
std::vector<vk::BufferCopy> copies;
|
|
3707
3837
|
|
|
@@ -3754,6 +3884,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
3754
3884
|
return ctx->device->pipeline_mul_f32;
|
|
3755
3885
|
}
|
|
3756
3886
|
return nullptr;
|
|
3887
|
+
case GGML_OP_DIV:
|
|
3888
|
+
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
3889
|
+
return ctx->device->pipeline_div_f32;
|
|
3890
|
+
}
|
|
3891
|
+
return nullptr;
|
|
3757
3892
|
case GGML_OP_SCALE:
|
|
3758
3893
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
3759
3894
|
return ctx->device->pipeline_scale_f32;
|
|
@@ -3823,11 +3958,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
3823
3958
|
{
|
|
3824
3959
|
const int mode = ((const int32_t *) dst->op_params)[2];
|
|
3825
3960
|
const bool is_neox = mode & 2;
|
|
3826
|
-
const bool is_glm = mode & 4;
|
|
3827
|
-
|
|
3828
|
-
if (is_glm) {
|
|
3829
|
-
return nullptr;
|
|
3830
|
-
}
|
|
3831
3961
|
|
|
3832
3962
|
if (is_neox) {
|
|
3833
3963
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
@@ -3838,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
3838
3968
|
}
|
|
3839
3969
|
} else {
|
|
3840
3970
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
3841
|
-
return ctx->device->
|
|
3971
|
+
return ctx->device->pipeline_rope_norm_f32;
|
|
3842
3972
|
}
|
|
3843
3973
|
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
|
3844
|
-
return ctx->device->
|
|
3974
|
+
return ctx->device->pipeline_rope_norm_f16;
|
|
3845
3975
|
}
|
|
3846
3976
|
}
|
|
3847
3977
|
return nullptr;
|
|
@@ -3851,6 +3981,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
3851
3981
|
return ctx->device->pipeline_argsort_f32;
|
|
3852
3982
|
}
|
|
3853
3983
|
return nullptr;
|
|
3984
|
+
case GGML_OP_SUM_ROWS:
|
|
3985
|
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
3986
|
+
return ctx->device->pipeline_sum_rows_f32;
|
|
3987
|
+
}
|
|
3988
|
+
return nullptr;
|
|
3854
3989
|
default:
|
|
3855
3990
|
return nullptr;
|
|
3856
3991
|
}
|
|
@@ -3873,6 +4008,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|
|
3873
4008
|
case GGML_OP_GET_ROWS:
|
|
3874
4009
|
case GGML_OP_ADD:
|
|
3875
4010
|
case GGML_OP_MUL:
|
|
4011
|
+
case GGML_OP_DIV:
|
|
3876
4012
|
case GGML_OP_SCALE:
|
|
3877
4013
|
case GGML_OP_SQR:
|
|
3878
4014
|
case GGML_OP_CLAMP:
|
|
@@ -3895,7 +4031,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3895
4031
|
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
|
3896
4032
|
#endif
|
|
3897
4033
|
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
|
3898
|
-
GGML_ASSERT(op
|
|
4034
|
+
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
|
3899
4035
|
GGML_ASSERT(dst->extra != nullptr);
|
|
3900
4036
|
const uint64_t ne00 = src0->ne[0];
|
|
3901
4037
|
const uint64_t ne01 = src0->ne[1];
|
|
@@ -3918,6 +4054,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3918
4054
|
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
|
3919
4055
|
const uint64_t ne2 = ne20 * ne21;
|
|
3920
4056
|
|
|
4057
|
+
const uint64_t ned0 = dst->ne[0];
|
|
4058
|
+
const uint64_t ned1 = dst->ne[1];
|
|
4059
|
+
const uint64_t ned2 = dst->ne[2];
|
|
4060
|
+
const uint64_t ned3 = dst->ne[3];
|
|
4061
|
+
const uint64_t ned = ned0 * ned1;
|
|
4062
|
+
|
|
3921
4063
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
|
3922
4064
|
ggml_vk_func_t op_func;
|
|
3923
4065
|
|
|
@@ -3967,10 +4109,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3967
4109
|
}
|
|
3968
4110
|
}
|
|
3969
4111
|
|
|
3970
|
-
uint64_t x_sz =
|
|
3971
|
-
uint64_t y_sz = use_src1 ?
|
|
3972
|
-
uint64_t z_sz = use_src2 ?
|
|
3973
|
-
uint64_t d_sz = ggml_type_size(dst->type) *
|
|
4112
|
+
uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
|
|
4113
|
+
uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
|
|
4114
|
+
uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
|
|
4115
|
+
uint64_t d_sz = ggml_type_size(dst->type) * ned;
|
|
3974
4116
|
|
|
3975
4117
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
|
3976
4118
|
|
|
@@ -3980,21 +4122,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
3980
4122
|
}
|
|
3981
4123
|
|
|
3982
4124
|
GGML_ASSERT(d_D != nullptr);
|
|
3983
|
-
uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
|
4125
|
+
uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
|
3984
4126
|
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
|
|
3985
4127
|
if(!src0_uma) {
|
|
3986
4128
|
d_X = extra_src0->buffer_gpu.lock();
|
|
3987
|
-
x_buf_offset = extra_src0->offset;
|
|
4129
|
+
x_buf_offset = extra_src0->offset + src0->view_offs;
|
|
3988
4130
|
GGML_ASSERT(d_X != nullptr);
|
|
3989
4131
|
}
|
|
3990
4132
|
if (use_src1 && !src1_uma) {
|
|
3991
4133
|
d_Y = extra_src1->buffer_gpu.lock();
|
|
3992
|
-
y_buf_offset = extra_src1->offset;
|
|
4134
|
+
y_buf_offset = extra_src1->offset + src1->view_offs;
|
|
3993
4135
|
GGML_ASSERT(d_Y != nullptr);
|
|
3994
4136
|
}
|
|
3995
4137
|
if (use_src2 && !src2_uma) {
|
|
3996
4138
|
d_Z = extra_src2->buffer_gpu.lock();
|
|
3997
|
-
z_buf_offset = extra_src2->offset;
|
|
4139
|
+
z_buf_offset = extra_src2->offset + src2->view_offs;
|
|
3998
4140
|
GGML_ASSERT(d_Z != nullptr);
|
|
3999
4141
|
}
|
|
4000
4142
|
|
|
@@ -4028,6 +4170,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
4028
4170
|
case GGML_OP_NORM:
|
|
4029
4171
|
case GGML_OP_RMS_NORM:
|
|
4030
4172
|
case GGML_OP_SOFT_MAX:
|
|
4173
|
+
case GGML_OP_SUM_ROWS:
|
|
4031
4174
|
elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
|
|
4032
4175
|
break;
|
|
4033
4176
|
case GGML_OP_DIAG_MASK_INF:
|
|
@@ -4056,7 +4199,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
4056
4199
|
z_sz *= ne22 * ne23;
|
|
4057
4200
|
}
|
|
4058
4201
|
if (d_sz != VK_WHOLE_SIZE) {
|
|
4059
|
-
d_sz *=
|
|
4202
|
+
d_sz *= ned2 * ned3;
|
|
4060
4203
|
}
|
|
4061
4204
|
}
|
|
4062
4205
|
|
|
@@ -4072,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
|
4072
4215
|
ggml_vk_sync_buffers(subctx);
|
|
4073
4216
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
4074
4217
|
} else if (op == GGML_OP_ROPE) {
|
|
4075
|
-
|
|
4076
|
-
|
|
4077
|
-
|
|
4078
|
-
|
|
4079
|
-
// Empty src2 is possible in rope, but the shader needs a buffer
|
|
4080
|
-
vk_subbuffer subbuf_z;
|
|
4081
|
-
if (use_src2) {
|
|
4082
|
-
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
|
4083
|
-
} else {
|
|
4084
|
-
subbuf_z = { d_X, 0, d_X->size };
|
|
4085
|
-
}
|
|
4086
|
-
|
|
4087
|
-
ggml_vk_sync_buffers(subctx);
|
|
4088
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
4218
|
+
// Empty src2 is possible in rope, but the shader needs a buffer
|
|
4219
|
+
vk_subbuffer subbuf_z;
|
|
4220
|
+
if (use_src2) {
|
|
4221
|
+
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
|
4089
4222
|
} else {
|
|
4090
|
-
|
|
4091
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
4223
|
+
subbuf_z = { d_X, 0, d_X->size };
|
|
4092
4224
|
}
|
|
4225
|
+
|
|
4226
|
+
ggml_vk_sync_buffers(subctx);
|
|
4227
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
4093
4228
|
} else if (use_src2) {
|
|
4094
4229
|
ggml_vk_sync_buffers(subctx);
|
|
4095
4230
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
|
@@ -4193,6 +4328,21 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
|
4193
4328
|
});
|
|
4194
4329
|
}
|
|
4195
4330
|
|
|
4331
|
+
static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
4332
|
+
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
4333
|
+
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
4334
|
+
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
4335
|
+
|
|
4336
|
+
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_DIV, {
|
|
4337
|
+
(uint32_t)ggml_nelements(src0),
|
|
4338
|
+
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
4339
|
+
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
4340
|
+
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
|
4341
|
+
0,
|
|
4342
|
+
0.0f, 0.0f,
|
|
4343
|
+
});
|
|
4344
|
+
}
|
|
4345
|
+
|
|
4196
4346
|
static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
4197
4347
|
float * op_params = (float *)dst->op_params;
|
|
4198
4348
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
@@ -4238,7 +4388,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
|
|
4238
4388
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
4239
4389
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
4240
4390
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
4241
|
-
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
|
4391
|
+
const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
|
4242
4392
|
|
|
4243
4393
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
|
4244
4394
|
(uint32_t)ggml_nelements(src0),
|
|
@@ -4296,9 +4446,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
4296
4446
|
|
|
4297
4447
|
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
4298
4448
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
4299
|
-
const int mode = ((int32_t *) dst->op_params)[2];
|
|
4449
|
+
// const int mode = ((int32_t *) dst->op_params)[2];
|
|
4300
4450
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
4301
|
-
const int
|
|
4451
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
4302
4452
|
const float freq_base = ((float *) dst->op_params)[5];
|
|
4303
4453
|
const float freq_scale = ((float *) dst->op_params)[6];
|
|
4304
4454
|
const float ext_factor = ((float *) dst->op_params)[7];
|
|
@@ -4306,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
|
4306
4456
|
const float beta_fast = ((float *) dst->op_params)[9];
|
|
4307
4457
|
const float beta_slow = ((float *) dst->op_params)[10];
|
|
4308
4458
|
|
|
4309
|
-
|
|
4310
|
-
|
|
4459
|
+
float corr_dims[2];
|
|
4460
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
4311
4461
|
|
|
4312
|
-
|
|
4462
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
4313
4463
|
|
|
4314
|
-
|
|
4315
|
-
|
|
4316
|
-
|
|
4317
|
-
|
|
4318
|
-
|
|
4319
|
-
const float inv_ndims = -1.0f / n_dims;
|
|
4320
|
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
|
4321
|
-
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
|
4322
|
-
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
|
4323
|
-
src2 != nullptr,
|
|
4324
|
-
});
|
|
4325
|
-
} else {
|
|
4326
|
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
|
4327
|
-
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
|
4328
|
-
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
|
4329
|
-
});
|
|
4330
|
-
}
|
|
4464
|
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
|
4465
|
+
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
|
4466
|
+
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
|
|
4467
|
+
src2 != nullptr,
|
|
4468
|
+
});
|
|
4331
4469
|
}
|
|
4332
4470
|
|
|
4333
4471
|
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
@@ -4342,10 +4480,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
4342
4480
|
|
|
4343
4481
|
GGML_ASSERT(ncols_pad <= 1024);
|
|
4344
4482
|
|
|
4345
|
-
std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
|
|
4346
|
-
|
|
4347
|
-
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
|
4348
|
-
|
|
4349
4483
|
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
|
4350
4484
|
ncols,
|
|
4351
4485
|
ncols_pad,
|
|
@@ -4353,6 +4487,10 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
|
4353
4487
|
});
|
|
4354
4488
|
}
|
|
4355
4489
|
|
|
4490
|
+
static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
4491
|
+
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f });
|
|
4492
|
+
}
|
|
4493
|
+
|
|
4356
4494
|
#ifdef GGML_VULKAN_RUN_TESTS
|
|
4357
4495
|
static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
|
|
4358
4496
|
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
|
|
@@ -4548,7 +4686,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
4548
4686
|
ggml_vk_ctx_begin(ctx, subctx);
|
|
4549
4687
|
ggml_vk_matmul(
|
|
4550
4688
|
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
|
4551
|
-
m, n, k,
|
|
4689
|
+
m, n, k,
|
|
4690
|
+
k, k, m, k*m, k*n, m*n,
|
|
4691
|
+
split_k, batch, batch, batch, 1, 1
|
|
4552
4692
|
);
|
|
4553
4693
|
ggml_vk_ctx_end(subctx);
|
|
4554
4694
|
}
|
|
@@ -5052,7 +5192,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
5052
5192
|
ggml_vk_ctx_begin(ctx, subctx);
|
|
5053
5193
|
ggml_vk_matmul(
|
|
5054
5194
|
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
|
5055
|
-
m, n, k,
|
|
5195
|
+
m, n, k,
|
|
5196
|
+
k, k, m, k*m, k*n, m*n,
|
|
5197
|
+
split_k, batch, batch, batch, 1, 1
|
|
5056
5198
|
);
|
|
5057
5199
|
ggml_vk_ctx_end(subctx);
|
|
5058
5200
|
}
|
|
@@ -5237,12 +5379,14 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
|
5237
5379
|
case GGML_OP_CONT:
|
|
5238
5380
|
case GGML_OP_DUP:
|
|
5239
5381
|
case GGML_OP_MUL:
|
|
5382
|
+
case GGML_OP_DIV:
|
|
5240
5383
|
case GGML_OP_NORM:
|
|
5241
5384
|
case GGML_OP_RMS_NORM:
|
|
5242
5385
|
case GGML_OP_DIAG_MASK_INF:
|
|
5243
5386
|
case GGML_OP_SOFT_MAX:
|
|
5244
5387
|
case GGML_OP_ROPE:
|
|
5245
5388
|
case GGML_OP_ARGSORT:
|
|
5389
|
+
case GGML_OP_SUM_ROWS:
|
|
5246
5390
|
break;
|
|
5247
5391
|
case GGML_OP_UNARY:
|
|
5248
5392
|
switch (ggml_get_unary_op(node)) {
|
|
@@ -5465,6 +5609,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5465
5609
|
const ggml_tensor * src2 = node->src[2];
|
|
5466
5610
|
|
|
5467
5611
|
switch (node->op) {
|
|
5612
|
+
// Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
|
|
5613
|
+
case GGML_OP_RESHAPE:
|
|
5614
|
+
case GGML_OP_VIEW:
|
|
5615
|
+
case GGML_OP_PERMUTE:
|
|
5616
|
+
case GGML_OP_TRANSPOSE:
|
|
5617
|
+
case GGML_OP_NONE:
|
|
5618
|
+
return;
|
|
5468
5619
|
case GGML_OP_UNARY:
|
|
5469
5620
|
switch (ggml_get_unary_op(node)) {
|
|
5470
5621
|
case GGML_UNARY_OP_SILU:
|
|
@@ -5479,16 +5630,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5479
5630
|
case GGML_OP_GET_ROWS:
|
|
5480
5631
|
case GGML_OP_ADD:
|
|
5481
5632
|
case GGML_OP_MUL:
|
|
5633
|
+
case GGML_OP_DIV:
|
|
5482
5634
|
case GGML_OP_SCALE:
|
|
5483
5635
|
case GGML_OP_SQR:
|
|
5484
5636
|
case GGML_OP_CLAMP:
|
|
5485
5637
|
case GGML_OP_CPY:
|
|
5486
5638
|
case GGML_OP_CONT:
|
|
5487
5639
|
case GGML_OP_DUP:
|
|
5488
|
-
case GGML_OP_RESHAPE:
|
|
5489
|
-
case GGML_OP_VIEW:
|
|
5490
|
-
case GGML_OP_PERMUTE:
|
|
5491
|
-
case GGML_OP_TRANSPOSE:
|
|
5492
5640
|
case GGML_OP_NORM:
|
|
5493
5641
|
case GGML_OP_RMS_NORM:
|
|
5494
5642
|
case GGML_OP_DIAG_MASK_INF:
|
|
@@ -5496,8 +5644,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5496
5644
|
case GGML_OP_ROPE:
|
|
5497
5645
|
case GGML_OP_MUL_MAT:
|
|
5498
5646
|
case GGML_OP_MUL_MAT_ID:
|
|
5499
|
-
case GGML_OP_NONE:
|
|
5500
5647
|
case GGML_OP_ARGSORT:
|
|
5648
|
+
case GGML_OP_SUM_ROWS:
|
|
5501
5649
|
break;
|
|
5502
5650
|
default:
|
|
5503
5651
|
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
|
|
@@ -5526,6 +5674,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5526
5674
|
case GGML_OP_MUL:
|
|
5527
5675
|
ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);
|
|
5528
5676
|
|
|
5677
|
+
break;
|
|
5678
|
+
case GGML_OP_DIV:
|
|
5679
|
+
ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node);
|
|
5680
|
+
|
|
5529
5681
|
break;
|
|
5530
5682
|
case GGML_OP_SCALE:
|
|
5531
5683
|
ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);
|
|
@@ -5544,12 +5696,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5544
5696
|
case GGML_OP_DUP:
|
|
5545
5697
|
ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
|
|
5546
5698
|
|
|
5547
|
-
break;
|
|
5548
|
-
case GGML_OP_RESHAPE:
|
|
5549
|
-
case GGML_OP_VIEW:
|
|
5550
|
-
case GGML_OP_PERMUTE:
|
|
5551
|
-
case GGML_OP_TRANSPOSE:
|
|
5552
|
-
case GGML_OP_NONE:
|
|
5553
5699
|
break;
|
|
5554
5700
|
case GGML_OP_NORM:
|
|
5555
5701
|
ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
|
|
@@ -5584,22 +5730,24 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
5584
5730
|
break;
|
|
5585
5731
|
case GGML_OP_ARGSORT:
|
|
5586
5732
|
ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);
|
|
5733
|
+
|
|
5734
|
+
break;
|
|
5735
|
+
case GGML_OP_SUM_ROWS:
|
|
5736
|
+
ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node);
|
|
5737
|
+
|
|
5587
5738
|
break;
|
|
5588
5739
|
case GGML_OP_MUL_MAT:
|
|
5589
5740
|
ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);
|
|
5590
5741
|
|
|
5591
5742
|
break;
|
|
5592
5743
|
case GGML_OP_MUL_MAT_ID:
|
|
5593
|
-
|
|
5594
|
-
std::cerr << "ggml_vulkan: GGML_OP_MUL_MAT_ID not implemented yet." << std::endl;
|
|
5595
|
-
GGML_ASSERT(false);
|
|
5744
|
+
ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
|
5596
5745
|
|
|
5597
5746
|
break;
|
|
5598
5747
|
default:
|
|
5599
5748
|
return;
|
|
5600
5749
|
}
|
|
5601
5750
|
|
|
5602
|
-
extra->ready = true;
|
|
5603
5751
|
extra->ctx_idx = ctx->compute_ctx->idx;
|
|
5604
5752
|
|
|
5605
5753
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
@@ -5622,6 +5770,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5622
5770
|
case GGML_OP_ADD:
|
|
5623
5771
|
case GGML_OP_GET_ROWS:
|
|
5624
5772
|
case GGML_OP_MUL:
|
|
5773
|
+
case GGML_OP_DIV:
|
|
5625
5774
|
case GGML_OP_SCALE:
|
|
5626
5775
|
case GGML_OP_SQR:
|
|
5627
5776
|
case GGML_OP_CLAMP:
|
|
@@ -5639,6 +5788,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5639
5788
|
case GGML_OP_TRANSPOSE:
|
|
5640
5789
|
case GGML_OP_NONE:
|
|
5641
5790
|
case GGML_OP_ARGSORT:
|
|
5791
|
+
case GGML_OP_SUM_ROWS:
|
|
5642
5792
|
extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
5643
5793
|
|
|
5644
5794
|
break;
|
|
@@ -5681,8 +5831,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5681
5831
|
ggml_vk_check_results_0(ctx, params, tensor);
|
|
5682
5832
|
#endif
|
|
5683
5833
|
|
|
5684
|
-
GGML_ASSERT(extra->ready);
|
|
5685
|
-
|
|
5686
5834
|
vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
|
|
5687
5835
|
|
|
5688
5836
|
// Only run if ctx hasn't been submitted yet
|
|
@@ -5707,8 +5855,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
5707
5855
|
subctx.out_memcpys.clear();
|
|
5708
5856
|
}
|
|
5709
5857
|
|
|
5710
|
-
extra->ready = false;
|
|
5711
|
-
|
|
5712
5858
|
return true;
|
|
5713
5859
|
}
|
|
5714
5860
|
|
|
@@ -5828,7 +5974,9 @@ struct ggml_backend_vk_buffer_context {
|
|
|
5828
5974
|
|
|
5829
5975
|
~ggml_backend_vk_buffer_context() {
|
|
5830
5976
|
ggml_vk_destroy_buffer(dev_buffer);
|
|
5831
|
-
|
|
5977
|
+
if (temp_tensor_extras != nullptr) {
|
|
5978
|
+
delete[] temp_tensor_extras;
|
|
5979
|
+
}
|
|
5832
5980
|
}
|
|
5833
5981
|
|
|
5834
5982
|
ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
|
|
@@ -5875,18 +6023,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
|
|
5875
6023
|
#endif
|
|
5876
6024
|
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
|
5877
6025
|
|
|
5878
|
-
|
|
5879
|
-
if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
|
|
6026
|
+
if (tensor->view_src != nullptr) {
|
|
5880
6027
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
|
5881
|
-
|
|
5882
|
-
extra
|
|
5883
|
-
extra->offset = extra_view->offset + tensor->view_offs;
|
|
6028
|
+
GGML_ASSERT(tensor->view_src->extra != nullptr);
|
|
6029
|
+
tensor->extra = tensor->view_src->extra;
|
|
5884
6030
|
} else {
|
|
6031
|
+
ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
|
|
5885
6032
|
extra->buffer_gpu = ctx->dev_buffer;
|
|
5886
6033
|
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
|
6034
|
+
tensor->extra = extra;
|
|
5887
6035
|
}
|
|
5888
|
-
|
|
5889
|
-
tensor->extra = extra;
|
|
5890
6036
|
}
|
|
5891
6037
|
|
|
5892
6038
|
GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
@@ -5899,7 +6045,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
|
|
5899
6045
|
|
|
5900
6046
|
vk_buffer buf = extra->buffer_gpu.lock();
|
|
5901
6047
|
|
|
5902
|
-
ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
|
|
6048
|
+
ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
|
|
5903
6049
|
}
|
|
5904
6050
|
|
|
5905
6051
|
GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
@@ -5912,7 +6058,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
|
|
5912
6058
|
|
|
5913
6059
|
vk_buffer buf = extra->buffer_gpu.lock();
|
|
5914
6060
|
|
|
5915
|
-
ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
|
|
6061
|
+
ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
|
|
5916
6062
|
}
|
|
5917
6063
|
|
|
5918
6064
|
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
@@ -5923,7 +6069,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|
|
5923
6069
|
vk_buffer src_buf = src_extra->buffer_gpu.lock();
|
|
5924
6070
|
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
|
|
5925
6071
|
|
|
5926
|
-
ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
|
|
6072
|
+
ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
|
|
5927
6073
|
|
|
5928
6074
|
return true;
|
|
5929
6075
|
}
|
|
@@ -5967,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
|
|
5967
6113
|
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
|
5968
6114
|
#endif
|
|
5969
6115
|
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
|
|
5970
|
-
|
|
6116
|
+
|
|
6117
|
+
vk_buffer dev_buffer = nullptr;
|
|
6118
|
+
try {
|
|
6119
|
+
dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
|
|
6120
|
+
} catch (const vk::SystemError& e) {
|
|
6121
|
+
return nullptr;
|
|
6122
|
+
}
|
|
5971
6123
|
|
|
5972
6124
|
ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
|
|
5973
6125
|
|
|
@@ -5990,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
|
|
|
5990
6142
|
UNUSED(buft);
|
|
5991
6143
|
}
|
|
5992
6144
|
|
|
5993
|
-
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
5994
|
-
if (!ggml_backend_is_vk(backend)) {
|
|
5995
|
-
return false;
|
|
5996
|
-
}
|
|
5997
|
-
|
|
5998
|
-
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
|
5999
|
-
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
6000
|
-
|
|
6001
|
-
return buft_ctx->ctx->idx == ctx->idx;
|
|
6002
|
-
}
|
|
6003
|
-
|
|
6004
6145
|
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
|
6005
6146
|
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
|
6006
6147
|
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
|
6007
6148
|
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
|
|
6008
6149
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
|
6009
6150
|
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
|
|
6010
|
-
/* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
|
|
6011
6151
|
/* .is_host = */ NULL,
|
|
6012
6152
|
};
|
|
6013
6153
|
|
|
@@ -6083,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
|
6083
6223
|
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
|
6084
6224
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
6085
6225
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
6086
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
6087
6226
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
6088
6227
|
},
|
|
6089
6228
|
/* .context = */ nullptr,
|
|
@@ -6149,7 +6288,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|
|
6149
6288
|
|
|
6150
6289
|
vk_buffer buf = extra->buffer_gpu.lock();
|
|
6151
6290
|
|
|
6152
|
-
ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
|
|
6291
|
+
ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
|
|
6153
6292
|
}
|
|
6154
6293
|
|
|
6155
6294
|
GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
@@ -6169,7 +6308,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|
|
6169
6308
|
|
|
6170
6309
|
vk_buffer buf = extra->buffer_gpu.lock();
|
|
6171
6310
|
|
|
6172
|
-
ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
|
|
6311
|
+
ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
|
|
6173
6312
|
}
|
|
6174
6313
|
|
|
6175
6314
|
GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
|
@@ -6190,7 +6329,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
|
|
6190
6329
|
vk_buffer src_buf = src_extra->buffer_gpu.lock();
|
|
6191
6330
|
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
|
|
6192
6331
|
|
|
6193
|
-
ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
|
|
6332
|
+
ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
|
|
6194
6333
|
return true;
|
|
6195
6334
|
}
|
|
6196
6335
|
|
|
@@ -6287,13 +6426,13 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
|
6287
6426
|
case GGML_UNARY_OP_GELU:
|
|
6288
6427
|
case GGML_UNARY_OP_SILU:
|
|
6289
6428
|
case GGML_UNARY_OP_RELU:
|
|
6290
|
-
return
|
|
6429
|
+
return ggml_is_contiguous(op->src[0]);
|
|
6291
6430
|
default:
|
|
6292
6431
|
return false;
|
|
6293
6432
|
}
|
|
6294
6433
|
break;
|
|
6295
6434
|
case GGML_OP_MUL_MAT:
|
|
6296
|
-
|
|
6435
|
+
case GGML_OP_MUL_MAT_ID:
|
|
6297
6436
|
{
|
|
6298
6437
|
switch (op->src[0]->type) {
|
|
6299
6438
|
case GGML_TYPE_F32:
|
|
@@ -6363,12 +6502,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
|
6363
6502
|
// return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
|
6364
6503
|
// } break;
|
|
6365
6504
|
case GGML_OP_ROPE:
|
|
6366
|
-
|
|
6367
|
-
const int mode = ((const int32_t *) op->op_params)[2];
|
|
6368
|
-
const bool is_glm = mode & 4;
|
|
6369
|
-
|
|
6370
|
-
return !is_glm;
|
|
6371
|
-
} break;
|
|
6505
|
+
return ggml_is_contiguous(op->src[0]);
|
|
6372
6506
|
case GGML_OP_NONE:
|
|
6373
6507
|
case GGML_OP_RESHAPE:
|
|
6374
6508
|
case GGML_OP_VIEW:
|
|
@@ -6377,6 +6511,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
|
6377
6511
|
case GGML_OP_NORM:
|
|
6378
6512
|
case GGML_OP_ADD:
|
|
6379
6513
|
case GGML_OP_MUL:
|
|
6514
|
+
case GGML_OP_DIV:
|
|
6380
6515
|
case GGML_OP_RMS_NORM:
|
|
6381
6516
|
case GGML_OP_SCALE:
|
|
6382
6517
|
case GGML_OP_SQR:
|
|
@@ -6385,6 +6520,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
|
6385
6520
|
case GGML_OP_DIAG_MASK_INF:
|
|
6386
6521
|
case GGML_OP_SOFT_MAX:
|
|
6387
6522
|
case GGML_OP_ARGSORT:
|
|
6523
|
+
case GGML_OP_SUM_ROWS:
|
|
6388
6524
|
return true;
|
|
6389
6525
|
default:
|
|
6390
6526
|
return false;
|
|
@@ -6394,17 +6530,23 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
|
6394
6530
|
}
|
|
6395
6531
|
|
|
6396
6532
|
GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
6397
|
-
const ggml_tensor * dst = op;
|
|
6398
|
-
|
|
6399
6533
|
const int min_batch_size = 32;
|
|
6400
6534
|
|
|
6401
|
-
|
|
6402
|
-
|
|
6535
|
+
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
|
6536
|
+
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
|
6537
|
+
|
|
6538
|
+
UNUSED(backend);
|
|
6539
|
+
}
|
|
6540
|
+
|
|
6541
|
+
GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
6542
|
+
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
|
6543
|
+
return false;
|
|
6403
6544
|
}
|
|
6404
6545
|
|
|
6405
|
-
|
|
6546
|
+
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
|
6547
|
+
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
6406
6548
|
|
|
6407
|
-
|
|
6549
|
+
return buft_ctx->ctx->idx == ctx->idx;
|
|
6408
6550
|
}
|
|
6409
6551
|
|
|
6410
6552
|
// TODO: enable async and synchronize
|
|
@@ -6418,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|
|
6418
6560
|
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
|
6419
6561
|
/* .graph_plan_create = */ NULL,
|
|
6420
6562
|
/* .graph_plan_free = */ NULL,
|
|
6563
|
+
/* .graph_plan_update = */ NULL,
|
|
6421
6564
|
/* .graph_plan_compute = */ NULL,
|
|
6422
6565
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
|
6423
6566
|
/* .supports_op = */ ggml_backend_vk_supports_op,
|
|
6567
|
+
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
|
6424
6568
|
/* .offload_op = */ ggml_backend_vk_offload_op,
|
|
6425
6569
|
/* .event_new = */ NULL,
|
|
6426
6570
|
/* .event_free = */ NULL,
|
|
@@ -6614,7 +6758,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
|
6614
6758
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
6615
6759
|
|
|
6616
6760
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
|
6617
|
-
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
|
|
6761
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
|
6618
6762
|
}
|
|
6619
6763
|
|
|
6620
6764
|
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
|
@@ -6681,9 +6825,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6681
6825
|
size_t src1_size;
|
|
6682
6826
|
size_t src2_size;
|
|
6683
6827
|
|
|
6684
|
-
void * src0_buffer;
|
|
6685
|
-
void * src1_buffer;
|
|
6686
|
-
void * src2_buffer;
|
|
6828
|
+
void * src0_buffer = nullptr;
|
|
6829
|
+
void * src1_buffer = nullptr;
|
|
6830
|
+
void * src2_buffer = nullptr;
|
|
6687
6831
|
|
|
6688
6832
|
if (src0 != nullptr) {
|
|
6689
6833
|
src0_clone = ggml_dup_tensor(ggml_ctx, src0);
|
|
@@ -6698,7 +6842,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6698
6842
|
} else if (ggml_backend_buffer_is_vk(src0->buffer)) {
|
|
6699
6843
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
6700
6844
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
|
6701
|
-
uint64_t offset = extra->offset;
|
|
6845
|
+
uint64_t offset = extra->offset + src0->view_offs;
|
|
6702
6846
|
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
|
|
6703
6847
|
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
|
6704
6848
|
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
|
@@ -6740,7 +6884,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6740
6884
|
} else if (ggml_backend_buffer_is_vk(src1->buffer)) {
|
|
6741
6885
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
|
6742
6886
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
|
6743
|
-
uint64_t offset = extra->offset;
|
|
6887
|
+
uint64_t offset = extra->offset + src1->view_offs;
|
|
6744
6888
|
if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
|
|
6745
6889
|
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
|
|
6746
6890
|
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
|
|
@@ -6798,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6798
6942
|
} else if (ggml_backend_buffer_is_vk(src2->buffer)) {
|
|
6799
6943
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
|
|
6800
6944
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
|
6801
|
-
uint64_t offset = extra->offset;
|
|
6945
|
+
uint64_t offset = extra->offset + src2->view_offs;
|
|
6802
6946
|
if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
|
|
6803
6947
|
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
|
6804
6948
|
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
|
@@ -6846,8 +6990,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6846
6990
|
|
|
6847
6991
|
if (tensor->op == GGML_OP_MUL_MAT) {
|
|
6848
6992
|
tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
|
|
6993
|
+
} else if (tensor->op == GGML_OP_MUL_MAT_ID) {
|
|
6994
|
+
tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone);
|
|
6849
6995
|
} else if (tensor->op == GGML_OP_MUL) {
|
|
6850
6996
|
tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
|
|
6997
|
+
} else if (tensor->op == GGML_OP_DIV) {
|
|
6998
|
+
tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
|
|
6851
6999
|
} else if (tensor->op == GGML_OP_SCALE) {
|
|
6852
7000
|
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
|
|
6853
7001
|
} else if (tensor->op == GGML_OP_SQR) {
|
|
@@ -6871,15 +7019,15 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6871
7019
|
} else if (tensor->op == GGML_OP_ROPE) {
|
|
6872
7020
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
|
6873
7021
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
|
6874
|
-
const int
|
|
6875
|
-
const int
|
|
7022
|
+
//const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
|
|
7023
|
+
const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
|
|
6876
7024
|
float freq_base = ((float *) tensor->op_params)[5];
|
|
6877
7025
|
float freq_scale = ((float *) tensor->op_params)[6];
|
|
6878
7026
|
float ext_factor = ((float *) tensor->op_params)[7];
|
|
6879
7027
|
float attn_factor = ((float *) tensor->op_params)[8];
|
|
6880
7028
|
float beta_fast = ((float *) tensor->op_params)[9];
|
|
6881
7029
|
float beta_slow = ((float *) tensor->op_params)[10];
|
|
6882
|
-
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode,
|
|
7030
|
+
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
|
6883
7031
|
} else if (tensor->op == GGML_OP_UNARY) {
|
|
6884
7032
|
switch (ggml_get_unary_op(tensor)) {
|
|
6885
7033
|
case GGML_UNARY_OP_SILU:
|
|
@@ -6917,6 +7065,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6917
7065
|
tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
|
|
6918
7066
|
} else if (tensor->op == GGML_OP_ARGSORT) {
|
|
6919
7067
|
tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
|
|
7068
|
+
} else if (tensor->op == GGML_OP_SUM_ROWS) {
|
|
7069
|
+
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
|
|
6920
7070
|
} else {
|
|
6921
7071
|
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
|
6922
7072
|
GGML_ASSERT(false);
|
|
@@ -6964,6 +7114,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6964
7114
|
|
|
6965
7115
|
ggml_tensor * src0 = tensor->src[0];
|
|
6966
7116
|
ggml_tensor * src1 = tensor->src[1];
|
|
7117
|
+
ggml_tensor * src2 = tensor->src[2];
|
|
6967
7118
|
|
|
6968
7119
|
void * tensor_data = tensor->data;
|
|
6969
7120
|
|
|
@@ -6974,11 +7125,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
6974
7125
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
6975
7126
|
|
|
6976
7127
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
|
6977
|
-
if (extra->offset + tensor_size >= buffer_gpu->size) {
|
|
6978
|
-
tensor_size = buffer_gpu->size - (extra->offset);
|
|
7128
|
+
if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
|
|
7129
|
+
tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
|
|
6979
7130
|
}
|
|
6980
7131
|
|
|
6981
|
-
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
|
|
7132
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
|
6982
7133
|
}
|
|
6983
7134
|
|
|
6984
7135
|
float first_error_result = -1.0f;
|
|
@@ -7022,6 +7173,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
7022
7173
|
if (src1 != nullptr) {
|
|
7023
7174
|
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
|
7024
7175
|
}
|
|
7176
|
+
if (src2 != nullptr) {
|
|
7177
|
+
std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
|
|
7178
|
+
}
|
|
7025
7179
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
|
7026
7180
|
std::cerr << std::endl << "Result:" << std::endl;
|
|
7027
7181
|
ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
|
|
@@ -7063,6 +7217,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
7063
7217
|
if (src1 != nullptr) {
|
|
7064
7218
|
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
|
7065
7219
|
}
|
|
7220
|
+
if (src2 != nullptr) {
|
|
7221
|
+
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
|
|
7222
|
+
}
|
|
7066
7223
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
|
7067
7224
|
std::cerr << std::endl << "Result:" << std::endl;
|
|
7068
7225
|
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
|
@@ -7087,6 +7244,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
|
7087
7244
|
if (src1 != nullptr) {
|
|
7088
7245
|
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
|
7089
7246
|
}
|
|
7247
|
+
if (src2 != nullptr) {
|
|
7248
|
+
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
|
|
7249
|
+
}
|
|
7090
7250
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
|
7091
7251
|
std::cerr << std::endl << "Result:" << std::endl;
|
|
7092
7252
|
ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
|