llama_cpp 0.16.0 → 0.16.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp (excerpt):

@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
-
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -9,12 +9,13 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>
 
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -150,7 +151,7 @@ struct vk_device {
     vk_pipeline pipeline_relu_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
-    vk_pipeline
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
     vk_pipeline pipeline_argsort_f32;
     vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +284,15 @@ struct vk_op_diag_mask_push_constants {
 
 struct vk_op_rope_push_constants {
     uint32_t ncols;
+    uint32_t n_dims;
     float freq_scale;
     uint32_t p_delta_rows;
     float freq_base;
     float ext_factor;
     float attn_factor;
-    float corr_dims[
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
+    float corr_dims[2];
     float theta_scale;
-
-    uint32_t has_freq_facs;
+    uint32_t has_ff;
 };
 
 struct vk_op_soft_max_push_constants {
@@ -345,15 +335,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -1537,11 +1524,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->
-    ggml_vk_create_pipeline(ctx, ctx->device->
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 
@@ -1569,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);
 
     const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1614,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
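The two hunks above chain a `vk::PhysicalDeviceDriverProperties` struct into the device query so the driver name can be printed next to the device name. Vulkan returns extended properties through a `pNext` chain: each extension struct is linked in before calling `getProperties2`, and one call fills them all. A minimal standalone sketch of the same pattern, assuming a Vulkan 1.2+ device and vulkan.hpp:

```cpp
#include <vulkan/vulkan.hpp>
#include <iostream>

// Print "<device> (<driver>)" for a physical device, mirroring the hunks above.
void print_device_and_driver(vk::PhysicalDevice physical_device) {
    vk::PhysicalDeviceProperties2 props2;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &driver_props;             // link the extension struct into the pNext chain
    physical_device.getProperties2(&props2);  // a single call fills every struct in the chain

    std::cerr << props2.properties.deviceName.data()
              << " (" << driver_props.driverName << ")" << std::endl;
}
```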
@@ -1710,7 +1699,78 @@ void ggml_vk_instance_init() {
         vk::PhysicalDeviceProperties props = devices[i].getProperties();
 
         if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
-
+            // Check if there are two physical devices corresponding to the same GPU
+            auto old_device = std::find_if(
+                vk_instance.device_indices.begin(),
+                vk_instance.device_indices.end(),
+                [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+            );
+            if (old_device == vk_instance.device_indices.end()) {
+                vk_instance.device_indices.push_back(i);
+            } else {
+                // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                // This can cause error when splitting layers aross the devices, need to keep only 1
+#ifdef GGML_VULKAN_DEBUG
+                std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
+#endif
+
+                vk::PhysicalDeviceProperties2 old_prop;
+                vk::PhysicalDeviceDriverProperties old_driver;
+                old_prop.pNext = &old_driver;
+                devices[*old_device].getProperties2(&old_prop);
+
+                vk::PhysicalDeviceProperties2 new_prop;
+                vk::PhysicalDeviceDriverProperties new_driver;
+                new_prop.pNext = &new_driver;
+                devices[i].getProperties2(&new_prop);
+
+                std::map<vk::DriverId, int> driver_priorities {};
+                int old_priority = std::numeric_limits<int>::max();
+                int new_priority = std::numeric_limits<int>::max();
+
+                // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                // Smaller number -> higher priority
+                switch (old_prop.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                }
+
+                if (driver_priorities.count(old_driver.driverID)) {
+                    old_priority = driver_priorities[old_driver.driverID];
+                }
+                if (driver_priorities.count(new_driver.driverID)) {
+                    new_priority = driver_priorities[new_driver.driverID];
+                }
+
+                if (new_priority < old_priority) {
+                    auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                    vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                    vk_instance.device_indices.push_back(i);
+
+#ifdef GGML_VULKAN_DEBUG
+                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
+#endif
+                }
+#ifdef GGML_VULKAN_DEBUG
+                else {
+                    std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
+
+                }
+#endif
+            }
         }
     }
 
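The dedup logic above handles the case where one GPU appears as two `VkPhysicalDevice`s because two ICDs are installed (for example RADV and AMDVLK); only the device exposed by the preferred driver is kept, since splitting layers across duplicates breaks. The tie-break reduces to a per-vendor ranking, sketched here in isolation (PCI vendor IDs written as literals; this helper is illustrative, not part of the diff):

```cpp
#include <vulkan/vulkan.hpp>
#include <limits>
#include <map>

// Rank a driver for a vendor; smaller is better, unknown drivers rank last.
// Illustrative sketch of the priority table introduced in the hunk above.
static int driver_priority(uint32_t vendor_id, vk::DriverId driver_id) {
    std::map<vk::DriverId, int> prio;
    switch (vendor_id) {
        case 0x1002: // AMD
            prio[vk::DriverId::eMesaRadv]       = 1;
            prio[vk::DriverId::eAmdOpenSource]  = 2;
            prio[vk::DriverId::eAmdProprietary] = 3;
            break;
        case 0x8086: // Intel
            prio[vk::DriverId::eIntelOpenSourceMESA]     = 1;
            prio[vk::DriverId::eIntelProprietaryWindows] = 2;
            break;
        case 0x10de: // NVIDIA
            prio[vk::DriverId::eNvidiaProprietary] = 1;
            break;
    }
    auto it = prio.find(driver_id);
    return it != prio.end() ? it->second : std::numeric_limits<int>::max();
}
```

A newly enumerated duplicate replaces the already-selected device only when its priority is strictly smaller, so the first device wins ties.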
@@ -2949,7 +3009,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +3018,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3174,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3182,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3306,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3323,14 +3383,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3519,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3527,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3696,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3704,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3829,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
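Every hunk from `ggml_vk_mul_mat_q_f16` down to `ggml_vk_op_repeat` applies the same one-line fix: view tensors now share their parent's `ggml_tensor_extra_gpu`, so the tensor's own `view_offs` must be added wherever a device-buffer offset is computed. A hedged sketch of the invariant, with stand-in types for the fields the diff touches:

```cpp
#include <cstdint>

// Stand-ins for the fields used throughout the hunks above (not the real structs).
struct tensor_lite       { uint64_t view_offs; }; // byte offset of a view into its parent tensor
struct tensor_extra_lite { uint64_t offset;    }; // base offset of the shared device allocation

// Effective byte offset of a tensor inside its Vulkan buffer: base offset of the
// (possibly shared) allocation plus the view offset. For non-views view_offs is 0,
// so old and new code agree; for views the old code addressed the wrong region.
static uint64_t effective_offset(const tensor_extra_lite * extra, const tensor_lite * t) {
    return extra->offset + t->view_offs;
}
```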
@@ -3908,10 +3968,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
     } else {
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f32;
         }
         if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->
+            return ctx->device->pipeline_rope_norm_f16;
         }
     }
     return nullptr;
@@ -4062,21 +4122,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4155,24 +4215,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
-
-
-
-
-        // Empty src2 is possible in rope, but the shader needs a buffer
-        vk_subbuffer subbuf_z;
-        if (use_src2) {
-            subbuf_z = { d_Z, z_buf_offset, z_sz };
-        } else {
-            subbuf_z = { d_X, 0, d_X->size };
-        }
-
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
-
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            subbuf_z = { d_X, 0, d_X->size };
         }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4336,7 +4388,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -4394,7 +4446,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
 
 static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
+    // const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     const float freq_base = ((float *) dst->op_params)[5];
@@ -4404,28 +4456,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     const float beta_fast = ((float *) dst->op_params)[9];
     const float beta_slow = ((float *) dst->op_params)[10];
 
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
-
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-
-
-
-
-
-
-
-        });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
-            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
-            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
-        });
-    }
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    });
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
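`ggml_vk_rope` now drives both NORM and NEOX modes through the single `vk_op_rope_push_constants` layout, and precomputes `theta_scale = freq_base^(-2/n_dims)` on the host. That one value is enough for the shader because RoPE's per-pair rotation angle forms a geometric sequence; a hedged host-side sketch of the relationship (the function name is illustrative, not from the diff):

```cpp
#include <cmath>

// RoPE rotates dimension pair i0 (i0 = 0, 2, 4, ...) at position `pos` by
// theta = pos * freq_base^(-i0/n_dims). With theta_scale = freq_base^(-2/n_dims)
// (the value packed into the push constants above), each successive pair's
// angle is the previous pair's angle times theta_scale.
static float rope_theta(int pos, int i0, float freq_base, int n_dims) {
    const float theta_scale = powf(freq_base, -2.0f / n_dims);
    return (float) pos * powf(theta_scale, i0 / 2.0f); // == pos * theta_scale^(i0/2)
}
```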
@@ -5569,6 +5609,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5637,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5644,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5696,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5748,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5831,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5855,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
@@ -5943,7 +5974,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +6023,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-
-        extra
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6045,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6058,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6069,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
     return true;
 }
@@ -6082,7 +6113,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
     std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
 
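With the `try`/`catch` above, a device allocation that throws `vk::SystemError` (for example on `VK_ERROR_OUT_OF_DEVICE_MEMORY`) now surfaces as a `NULL` buffer instead of an uncaught exception. A hedged sketch of a caller relying on that contract through the public ggml-backend API (the fallback policy shown is hypothetical):

```cpp
#include "ggml-backend.h"
#include <stddef.h>

// Try the Vulkan buffer type first and fall back to another buffer type
// (e.g. a host buffer) when device allocation fails and returns NULL.
static ggml_backend_buffer_t alloc_with_fallback(ggml_backend_buffer_type_t vk_buft,
                                                 ggml_backend_buffer_type_t fallback_buft,
                                                 size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(vk_buft, size);
    if (buf == NULL) {
        buf = ggml_backend_buft_alloc_buffer(fallback_buft, size);
    }
    return buf;
}
```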
@@ -6105,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -6198,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6264,7 +6288,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6308,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6329,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
     return true;
 }
 
@@ -6402,7 +6426,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_UNARY_OP_GELU:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_RELU:
-            return
+            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -6478,11 +6502,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
             // } break;
         case GGML_OP_ROPE:
-
-            const int mode = ((const int32_t *) op->op_params)[2];
-
-            return true;
-            } break;
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6518,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6529,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -6725,7 +6758,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6842,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6884,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6942,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7125,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;