llama_cpp 0.16.0 → 0.16.1
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp:

@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
-
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -9,12 +9,13 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>
 
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -150,7 +151,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
 
 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };
 
 struct vk_op_soft_max_push_constants {
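The two rope structs above collapse into a single vk_op_rope_push_constants shared by the NORM and NEOX pipelines. Push constants are a natural fit here because every Vulkan implementation must support at least 128 bytes of push-constant space (maxPushConstantsSize >= 128). A minimal sketch of the merged layout; the static_assert is illustrative and not part of the diff:

```cpp
#include <cstdint>

// Merged rope push constants, mirroring the struct in the diff above. All
// members are 4-byte scalars (corr_dims is two of them), so the struct is
// densely packed with no padding.
struct vk_op_rope_push_constants {
    uint32_t ncols;
    uint32_t n_dims;
    float    freq_scale;
    uint32_t p_delta_rows;
    float    freq_base;
    float    ext_factor;
    float    attn_factor;
    float    corr_dims[2];
    float    theta_scale;
    uint32_t has_ff;
};

// Illustrative check only: 11 scalars * 4 bytes = 44 bytes, well under the
// 128-byte push-constant minimum guaranteed by the Vulkan spec.
static_assert(sizeof(vk_op_rope_push_constants) == 44, "host layout must match the shader block");
```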
@@ -345,15 +335,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -1537,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->
-    ggml_vk_create_pipeline(ctx, ctx->device->
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1569,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1614,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
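The print_gpu_info change extends the existing pNext chain by one link, so a single getProperties2() call also fills VkPhysicalDeviceDriverProperties (core in Vulkan 1.2). A standalone sketch of the same pattern, reduced to just the driver link:

```cpp
#include <vulkan/vulkan.hpp>
#include <iostream>

// Query the driver name of a physical device via a pNext chain:
// props2 -> driver_props, both filled by one getProperties2() call.
void print_device_driver(vk::PhysicalDevice physical_device) {
    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &driver_props;
    physical_device.getProperties2(&props2);

    std::cout << props2.properties.deviceName << " ("
              << driver_props.driverName << ")" << std::endl;
}
```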
@@ -1710,7 +1699,78 @@ void ggml_vk_instance_init() {
         vk::PhysicalDeviceProperties props = devices[i].getProperties();
 
         if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-
+            // Check if there are two physical devices corresponding to the same GPU
+            auto old_device = std::find_if(
+                vk_instance.device_indices.begin(),
+                vk_instance.device_indices.end(),
+                [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+            );
+            if (old_device == vk_instance.device_indices.end()) {
+                vk_instance.device_indices.push_back(i);
+            } else {
+                // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                // This can cause error when splitting layers aross the devices, need to keep only 1
+#ifdef GGML_VULKAN_DEBUG
+                std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
+#endif
+
+                vk::PhysicalDeviceProperties2 old_prop;
+                vk::PhysicalDeviceDriverProperties old_driver;
+                old_prop.pNext = &old_driver;
+                devices[*old_device].getProperties2(&old_prop);
+
+                vk::PhysicalDeviceProperties2 new_prop;
+                vk::PhysicalDeviceDriverProperties new_driver;
+                new_prop.pNext = &new_driver;
+                devices[i].getProperties2(&new_prop);
+
+                std::map<vk::DriverId, int> driver_priorities {};
+                int old_priority = std::numeric_limits<int>::max();
+                int new_priority = std::numeric_limits<int>::max();
+
+                // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                // Smaller number -> higher priority
+                switch (old_prop.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                }
+
+                if (driver_priorities.count(old_driver.driverID)) {
+                    old_priority = driver_priorities[old_driver.driverID];
+                }
+                if (driver_priorities.count(new_driver.driverID)) {
+                    new_priority = driver_priorities[new_driver.driverID];
+                }
+
+                if (new_priority < old_priority) {
+                    auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                    vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                    vk_instance.device_indices.push_back(i);
+
+#ifdef GGML_VULKAN_DEBUG
+                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
+#endif
+                }
+#ifdef GGML_VULKAN_DEBUG
+                else {
+                    std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
+
+                }
+#endif
+            }
         }
     }
 
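The new selection logic keeps at most one physical device per deviceID: when two drivers (for example RADV and the AMD proprietary driver) expose the same GPU, splitting layers across both "devices" would misbehave, so the duplicate whose driver ranks lower is dropped. A condensed sketch of the ranking idea; the raw PCI vendor IDs stand in for the VK_VENDOR_ID_* macros used in the file:

```cpp
#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <limits>
#include <map>

// Lower value = preferred driver. Unknown drivers rank last, so any
// recognized driver always wins over an unrecognized one.
static int driver_priority(uint32_t vendor_id, vk::DriverId id) {
    std::map<vk::DriverId, int> prio;
    switch (vendor_id) {
        case 0x1002: // AMD
            prio[vk::DriverId::eMesaRadv]       = 1;
            prio[vk::DriverId::eAmdOpenSource]  = 2;
            prio[vk::DriverId::eAmdProprietary] = 3;
            break;
        case 0x8086: // Intel
            prio[vk::DriverId::eIntelOpenSourceMESA]     = 1;
            prio[vk::DriverId::eIntelProprietaryWindows] = 2;
            break;
        case 0x10DE: // NVIDIA
            prio[vk::DriverId::eNvidiaProprietary] = 1;
            break;
    }
    auto it = prio.find(id);
    return it == prio.end() ? std::numeric_limits<int>::max() : it->second;
}
```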
@@ -2949,7 +3009,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +3018,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3174,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3182,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3306,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3323,14 +3383,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3519,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3527,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3696,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3704,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3829,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -3908,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f32;
        }
        if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f16;
        }
    }
    return nullptr;
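One pattern repeats through all of the buffer lookups above: every site that used extra->offset alone now also adds the tensor's view_offs. The reason is the init_tensor change later in this diff, where a view tensor shares its parent's extra instead of getting an offset baked in at init time. The arithmetic, reduced to a sketch with hypothetical minimal types:

```cpp
#include <cstdint>

// Hypothetical, stripped-down stand-ins for the real structs, for illustration.
struct extra_gpu { uint64_t offset; };   // where the parent tensor's data starts in the device buffer
struct tensor_t  { extra_gpu * extra; uint64_t view_offs; };

// A view shares its parent's extra, so its position in the device buffer is
// the parent's offset plus the view's byte offset within the parent.
static uint64_t device_buffer_offset(const tensor_t & t) {
    return t.extra->offset + t.view_offs;
}
```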
@@ -4062,21 +4122,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4155,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-
-
-
-
-        // Empty src2 is possible in rope, but the shader needs a buffer
-        vk_subbuffer subbuf_z;
-        if (use_src2) {
-            subbuf_z = { d_Z, z_buf_offset, z_sz };
-        } else {
-            subbuf_z = { d_X, 0, d_X->size };
-        }
-
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4388,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-
-
-
-
-
-
-
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
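The rewritten ggml_vk_rope no longer branches on the NEOX mode on the host: both modes share one push-constant layout, and theta_scale = freq_base^(-2/n_dims) lets the shader derive each rotation angle incrementally instead of evaluating a pow per dimension. A worked host-side example of that recurrence (standard RoPE, for illustration only):

```cpp
#include <cmath>
#include <cstdio>

// theta_i = pos * freq_base^(-2*i/n_dims) = pos * theta_scale^i, so starting
// from theta_0 = pos, each dimension pair just multiplies by theta_scale.
int main() {
    const float freq_base   = 10000.0f;
    const int   n_dims      = 128;
    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    const float pos   = 42.0f;  // token position
    float       theta = pos;    // angle for dimension pair 0
    for (int i = 0; i < 4; ++i) {
        printf("pair %d: cos = %f, sin = %f\n", i, cosf(theta), sinf(theta));
        theta *= theta_scale;   // advance to the next dimension pair
    }
    return 0;
}
```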
@@ -5569,6 +5609,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5637,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5644,6 @@
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
    case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5696,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5748,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5831,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5855,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
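Moving RESHAPE/VIEW/PERMUTE/TRANSPOSE/NONE to an early return reflects that these ops only rewrite tensor metadata (shape and strides); the result aliases the source data, so building a compute context for them would be wasted work. The idea as a sketch (the helper name is ours, not a ggml API):

```cpp
#include "ggml.h"

// Layout-only ops never touch the data buffer, so a backend can skip
// scheduling any GPU work for them.
static bool is_layout_only_op(enum ggml_op op) {
    switch (op) {
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NONE:
            return true;
        default:
            return false;
    }
}
```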
@@ -5943,7 +5974,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +6023,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-
-        extra
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6045,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6058,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6069,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
     return true;
 }
@@ -6082,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
     std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
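alloc_buffer now traps vk::SystemError, which is how vulkan.hpp surfaces failures such as VK_ERROR_OUT_OF_DEVICE_MEMORY when exceptions are enabled, and returns nullptr so the caller can degrade gracefully instead of aborting. The same shape with a plain Vulkan allocation (a sketch, not the gem's code):

```cpp
#include <vulkan/vulkan.hpp>

// allocateMemory throws vk::SystemError on failure (e.g. device OOM).
// Returning a null handle turns the exception into an error the caller can
// react to, for example by trying a smaller allocation or host memory.
vk::DeviceMemory try_allocate(vk::Device device, const vk::MemoryAllocateInfo & info) {
    try {
        return device.allocateMemory(info);
    } catch (const vk::SystemError &) {
        return nullptr; // VK_NULL_HANDLE
    }
}
```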
@@ -6105,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -6198,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
     /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
 },
 /* .context = */ nullptr,
@@ -6264,7 +6288,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6308,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6329,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
     return true;
 }
 
@@ -6402,7 +6426,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
-                    return
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
             }
@@ -6478,11 +6502,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-
-            const int mode = ((const int32_t *) op->op_params)[2];
-
-            return true;
-        } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
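The per-buffer-type supports_backend callback is gone; ownership checks now live on the backend as supports_buft. The test compares the buffer type's get_name function pointer, which acts as a cheap type tag: every Vulkan buffer type shares the same get_name, so pointer equality answers "is this one of mine" without strings or RTTI. Generic shape of the check (a sketch that relies on the internal ggml-backend-impl.h interface):

```cpp
#include "ggml-backend-impl.h"

// A buffer type is "ours" iff its get_name callback is our callback:
// function-pointer identity doubles as a type tag.
static bool is_my_buffer_type(ggml_backend_buffer_type_t buft,
                              const char * (*my_get_name)(ggml_backend_buffer_type_t)) {
    return buft->iface.get_name == my_get_name;
}
```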
@@ -6725,7 +6758,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6842,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        uint64_t offset = extra->offset;
+        uint64_t offset = extra->offset + src0->view_offs;
         if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
             for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                 for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6884,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        uint64_t offset = extra->offset;
+        uint64_t offset = extra->offset + src1->view_offs;
         if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
             for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                 for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        uint64_t offset = extra->offset;
+        uint64_t offset = extra->offset + src2->view_offs;
         if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
             for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                 for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7125,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;