llama_cpp 0.15.2 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -114,6 +114,7 @@ struct vk_device {
     size_t idx;
 
     vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
     vk_matmul_pipeline pipeline_matmul_f16;
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
     float corr_dims[4];
     float theta_scale;
     float inv_ndims;
+    uint32_t has_freq_facs;
 };
 
 struct vk_op_soft_max_push_constants {
     uint32_t KX;
     uint32_t KY;
-    uint32_t KZ;
     float scale;
     float max_bias;
     float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {
 
 struct vk_op_argsort_push_constants {
     uint32_t ncols;
-
+    uint32_t ncols_pad;
+    int32_t order;
 };
 
 // Allow pre-recording command buffers
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
     vk_context * compute_ctx;
     vk_context * transfer_ctx;
 
-    bool disable;
     bool initialized;
 
     size_t idx;
 };
 
-struct
+struct vk_instance_t {
     vk::Instance instance;
 
     std::vector<size_t> device_indices;
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 
 static bool vk_instance_initialized = false;
-static
+static vk_instance_t vk_instance;
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align = 32;
 
     ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main",
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main",
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main",
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main",
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;
 
-    ctx->disable = false;
     ctx->initialized = true;
 
     ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f16_f32;
     }
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
     uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
     uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
 #endif
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
 #endif
     const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
         0, 0, 0, 0, 1
     ); // NOLINT
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) ((char *) dst->data);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
-    const uint64_t ne10 = src1->ne[0];
-
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3834,7 +3815,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_soft_max_f32;
         }
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 &&
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_soft_max_f32_f16;
         }
         return nullptr;
@@ -3873,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     default:
         return nullptr;
     }
+
+    GGML_UNUSED(src2);
 }
 
 static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3902,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
+    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
-        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
+        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
-        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ",
+        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3919,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne02 = src0->ne[2];
     const uint64_t ne03 = src0->ne[3];
     const uint64_t ne0 = ne00 * ne01;
+
     const bool use_src1 = src1 != nullptr;
     const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
     const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3926,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
     const uint64_t ne1 = ne10 * ne11;
     // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
 
     const bool use_src2 = src2 != nullptr;
-    const uint64_t
+    const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+    const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+    const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+    const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+    const uint64_t ne2 = ne20 * ne21;
 
     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
     ggml_vk_func_t op_func;
@@ -3976,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             src1_uma = d_Y != nullptr;
         }
         if (use_src2) {
-            ggml_vk_host_get(ctx,
+            ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
             src2_uma = d_Z != nullptr;
         }
     }
@@ -3989,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();
 
     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 &&
+    if (use_src1 && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }
 
@@ -4006,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }
-
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
@@ -4016,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     if (op_supports_incontiguous) {
         x_sz = ggml_nbytes(src0);
         y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) : 0;
        d_sz = ggml_nbytes(dst);
 
         if (x_buf_offset + x_sz >= d_X->size) {
@@ -4024,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
             y_sz = VK_WHOLE_SIZE;
         }
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = VK_WHOLE_SIZE;
+        }
         if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
@@ -4046,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
         break;
     case GGML_OP_GET_ROWS:
-        elements = {
+        elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+        break;
+    case GGML_OP_ARGSORT:
+        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
         break;
     default:
         elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4060,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         if (use_src1 && y_sz != VK_WHOLE_SIZE) {
             y_sz *= ne12 * ne13;
         }
+        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+            z_sz *= ne22 * ne23;
+        }
         if (d_sz != VK_WHOLE_SIZE) {
             d_sz *= ne02 * ne03;
         }
     }
 
     if (op == GGML_OP_SOFT_MAX) {
-        // Empty src1
+        // Empty src1 is possible in soft_max, but the shader needs a buffer
         vk_subbuffer subbuf_y;
         if (use_src1) {
             subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4074,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             subbuf_y = { d_X, 0, d_X->size };
         }
 
-
-
-
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (op == GGML_OP_ROPE) {
+        const int mode = ((int32_t *) dst->op_params)[2];
+        const bool is_neox = mode & 2;
+
+        if (is_neox) {
+            // Empty src2 is possible in rope, but the shader needs a buffer
+            vk_subbuffer subbuf_z;
+            if (use_src2) {
+                subbuf_z = { d_Z, z_buf_offset, z_sz };
+            } else {
+                subbuf_z = { d_X, 0, d_X->size };
+            }
+
+            ggml_vk_sync_buffers(subctx);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else {
-
+            ggml_vk_sync_buffers(subctx);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-
+    } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz },
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src1) {
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4090,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
-        ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-    } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
-    }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+        GGML_ASSERT(op != GGML_OP_ARGSORT);
+        GGML_ASSERT(!use_src2);
 
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
 
         switch (dst->op) {
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
             elements = { (uint32_t)ne01, 1, 1 };
             break;
         case GGML_OP_DIAG_MASK_INF:
@@ -4135,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-            // copy dst to host
-            ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
-        }
         }
     }
 }
@@ -4269,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
 }
 
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1,
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
 
     float scale = op_params[0];
@@ -4285,20 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
     const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
-
-    #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
-
-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
+    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
-        src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
         scale, max_bias,
         m0, m1,
         n_head_log2,
     });
 }
 
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4321,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1,
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+            src2 != nullptr,
+        });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1,
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+        });
     }
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     int32_t * op_params = (int32_t *)dst->op_params;
-
+
+    uint32_t ncols = src0->ne[0];
+
+    uint32_t ncols_pad = 1;
+    while (ncols_pad < ncols) {
+        ncols_pad *= 2;
+    }
+
+    GGML_ASSERT(ncols_pad <= 1024);
+
+    std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+    std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+        ncols,
+        ncols_pad,
+        op_params[0],
+    });
 }
 
 #ifdef GGML_VULKAN_RUN_TESTS
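Note on the argsort hunk above: the new code rounds the sort width up to the next power of two before dispatching, and the argsort pipeline is created earlier in this diff with a fixed {1024, 1, 1} workgroup, which is presumably why ncols_pad is asserted to stay at or below 1024 (GPU argsort kernels of this kind typically run a bitonic-style network over a power-of-two block). Below is a minimal standalone sketch of the same padding computation; the helper name next_pow2 is ours for illustration and is not part of the diff.

    #include <cassert>
    #include <cstdint>

    // Round ncols up to the next power of two, mirroring the loop added in ggml_vk_argsort.
    static uint32_t next_pow2(uint32_t ncols) {
        uint32_t ncols_pad = 1;
        while (ncols_pad < ncols) {
            ncols_pad *= 2;
        }
        return ncols_pad;
    }

    int main() {
        const uint32_t ncols     = 300;              // example row width to be sorted (hypothetical)
        const uint32_t ncols_pad = next_pow2(ncols); // 300 -> 512
        assert(ncols_pad == 512);
        assert(ncols_pad <= 1024);                   // same limit the diff asserts before dispatch
        return 0;
    }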
@@ -4381,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_s;
         shname = "F32_ALIGNED_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_s;
+        shname = "F32_F16_ALIGNED_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_s;
         shname = "F16_F32_ALIGNED_S";
@@ -4394,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_m;
         shname = "F32_ALIGNED_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_m;
+        shname = "F32_F16_ALIGNED_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_m;
         shname = "F16_F32_ALIGNED_M";
@@ -4407,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_l;
         shname = "F32_ALIGNED_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_l;
+        shname = "F32_F16_ALIGNED_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_l;
         shname = "F16_F32_ALIGNED_L";
@@ -4427,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->s;
         shname = "F32_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->s;
+        shname = "F32_F16_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->s;
         shname = "F16_F32_S";
@@ -4438,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->m;
         shname = "F32_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->m;
+        shname = "F32_F16_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->m;
         shname = "F16_F32_M";
@@ -4449,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->l;
         shname = "F32_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->l;
+        shname = "F32_F16_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->l;
         shname = "F16_F32_L";
@@ -4561,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5049,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5134,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (extra == nullptr) {
         return;
     }
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
 
@@ -5244,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
@@ -5420,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (ggml_is_empty(node) || extra == nullptr) {
         return;
     }
 
@@ -5434,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src1 = node->src[1];
     const ggml_tensor * src2 = node->src[2];
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     switch (node->op) {
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -5547,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SOFT_MAX:
-        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1,
+        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
 
         break;
     case GGML_OP_ROPE:
-        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
 
         break;
     case GGML_OP_ARGSORT:
@@ -5580,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (
+    if (last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -5588,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    if (ctx->disable) {
-        return false;
-    }
-
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5650,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }
 
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
 #endif
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5690,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 
 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
 #endif
@@ -5865,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }
 
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }
 
@@ -5873,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5888,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6032,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
+    size += 32; // Behave like the CPU buffer type
     void * ptr = nullptr;
     try {
         ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
@@ -6119,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|
6119
6136
|
#endif
|
6120
6137
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
6121
6138
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
6122
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
6123
6139
|
|
6124
6140
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
6125
6141
|
|
@@ -6140,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|
6140
6156
|
#endif
|
6141
6157
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
6142
6158
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
6143
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
6144
6159
|
|
6145
6160
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
6146
6161
|
|
@@ -6206,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
|
|
6206
6221
|
ctx->transfer_ctx = nullptr;
|
6207
6222
|
}
|
6208
6223
|
|
6224
|
+
static bool ggml_vk_is_empty(ggml_tensor * node) {
|
6225
|
+
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
|
6226
|
+
}
|
6227
|
+
|
6209
6228
|
GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
6210
6229
|
#ifdef GGML_VULKAN_DEBUG
|
6211
6230
|
std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
|
@@ -6220,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
|
|
6220
6239
|
int last_node = cgraph->n_nodes - 1;
|
6221
6240
|
|
6222
6241
|
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
6223
|
-
while (last_node > 0 && (cgraph->nodes[last_node]
|
6242
|
+
while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
|
6224
6243
|
last_node -= 1;
|
6225
6244
|
}
|
6226
6245
|
|
@@ -6234,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
|
|
6234
6253
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
6235
6254
|
ggml_tensor * node = cgraph->nodes[i];
|
6236
6255
|
|
6237
|
-
if (
|
6256
|
+
if (ggml_vk_is_empty(node)) {
|
6238
6257
|
continue;
|
6239
6258
|
}
|
6240
6259
|
|
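Note: the hunks above add ggml_vk_is_empty and use it both to trim trailing no-op nodes (so the command buffer is closed by a node that records real work) and to skip no-op nodes while recording the graph. A self-contained sketch of that skip logic, using toy Node/graph types instead of the real ggml_tensor and ggml_cgraph:

    // Sketch of the skip logic introduced by ggml_vk_is_empty(); the Node type
    // and op list below are simplified stand-ins, not the real ggml structures.
    #include <cstdio>
    #include <vector>

    enum Op { OP_NONE, OP_RESHAPE, OP_TRANSPOSE, OP_VIEW, OP_PERMUTE, OP_MUL_MAT, OP_SOFT_MAX };

    struct Node {
        Op   op;
        long nelements; // 0 -> empty tensor
    };

    // Nodes that produce no work for the backend.
    static bool is_empty(const Node & n) {
        return n.nelements == 0 || n.op == OP_NONE || n.op == OP_RESHAPE ||
               n.op == OP_TRANSPOSE || n.op == OP_VIEW || n.op == OP_PERMUTE;
    }

    int main() {
        std::vector<Node> nodes = {
            {OP_MUL_MAT, 1024}, {OP_SOFT_MAX, 1024}, {OP_VIEW, 1024}, {OP_RESHAPE, 1024},
        };

        // Trim trailing no-op nodes, as in ggml_backend_vk_graph_compute.
        int last_node = (int) nodes.size() - 1;
        while (last_node > 0 && is_empty(nodes[last_node])) {
            last_node -= 1;
        }

        // Skip no-op nodes while recording work.
        for (int i = 0; i < (int) nodes.size(); i++) {
            if (is_empty(nodes[i])) {
                continue;
            }
            std::printf("record node %d%s\n", i, i == last_node ? " (closes command buffer)" : "");
        }
        return 0;
    }
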
@@ -6536,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
|
6536
6555
|
for (int j = 0; j < level; j++) {
|
6537
6556
|
std::cerr << " ";
|
6538
6557
|
}
|
6539
|
-
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) <<
|
6558
|
+
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
|
6540
6559
|
|
6541
6560
|
done.push_back(tensor);
|
6542
6561
|
|
@@ -6548,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
|
6548
6567
|
}
|
6549
6568
|
|
6550
6569
|
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
6551
|
-
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
6570
|
+
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
|
6552
6571
|
return;
|
6553
6572
|
}
|
6554
6573
|
i0 = std::max(i0, 5);
|
@@ -6569,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|
6569
6588
|
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
6570
6589
|
} else if (tensor->type == GGML_TYPE_F16) {
|
6571
6590
|
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
6591
|
+
} else if (tensor->type == GGML_TYPE_I32) {
|
6592
|
+
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
6572
6593
|
} else {
|
6573
6594
|
GGML_ASSERT(false);
|
6574
6595
|
}
|
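Note: the debug printer now also understands GGML_TYPE_I32, reading each element through the tensor's byte strides nb[0..3] just like F32/F16. A hedged, self-contained sketch of that stride-based access, with a simplified tensor descriptor standing in for ggml_tensor:

    // Stride-based element access: the offset of (i0, i1, i2, i3) is the dot
    // product of the indices with the byte strides nb[]. Toy Tensor type only.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    enum Type { TYPE_F32, TYPE_I32 };

    struct Tensor {
        Type    type;
        int64_t ne[4]; // elements per dimension
        size_t  nb[4]; // byte stride per dimension
        void  * data;
    };

    static float read_as_float(const Tensor & t, int i0, int i1, int i2, int i3) {
        const char * p = (const char *) t.data +
            i3 * t.nb[3] + i2 * t.nb[2] + i1 * t.nb[1] + i0 * t.nb[0];
        if (t.type == TYPE_F32) {
            float v; std::memcpy(&v, p, sizeof(v)); return v;
        }
        int32_t v; std::memcpy(&v, p, sizeof(v)); return (float) v; // TYPE_I32
    }

    int main() {
        std::vector<int32_t> buf = {1, 2, 3, 4, 5, 6}; // 3x2, row-contiguous
        Tensor t = { TYPE_I32, {3, 2, 1, 1}, {4, 12, 24, 24}, buf.data() };
        std::printf("t[2,1] = %.0f\n", read_as_float(t, 2, 1, 0, 0)); // prints 6
        return 0;
    }
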
@@ -6584,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|
6584
6605
|
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
6585
6606
|
void * tensor_data = tensor->data;
|
6586
6607
|
|
6587
|
-
if (tensor->
|
6608
|
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
6588
6609
|
const size_t tensor_size = ggml_nbytes(tensor);
|
6589
6610
|
tensor_data = malloc(tensor_size);
|
6590
6611
|
|
@@ -6595,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
6595
6616
|
}
|
6596
6617
|
|
6597
6618
|
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
6598
|
-
std::cerr << "tensor=" << tensor << " tensor->
|
6619
|
+
std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
6599
6620
|
if (tensor->src[0] != nullptr) {
|
6600
|
-
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << "
|
6621
|
+
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
6601
6622
|
}
|
6602
6623
|
if (tensor->src[1] != nullptr) {
|
6603
|
-
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << "
|
6624
|
+
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
6604
6625
|
}
|
6605
6626
|
std::cerr << std::endl << "Result:" << std::endl;
|
6606
6627
|
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
@@ -6611,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
|
6611
6632
|
std::vector<const ggml_tensor *> done;
|
6612
6633
|
ggml_vk_print_graph_origin(tensor, done);
|
6613
6634
|
|
6614
|
-
if (tensor->
|
6635
|
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
6615
6636
|
free(tensor_data);
|
6616
6637
|
}
|
6617
6638
|
}
|
6618
6639
|
|
6619
|
-
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
6620
|
-
return;
|
6621
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
6622
|
-
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
6623
|
-
return;
|
6624
|
-
}
|
6625
|
-
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
6626
|
-
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
6627
|
-
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
6628
|
-
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
6629
|
-
float val = 0.0f;
|
6630
|
-
if (tensor->type == GGML_TYPE_F32) {
|
6631
|
-
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
6632
|
-
} else if (tensor->type == GGML_TYPE_F16) {
|
6633
|
-
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
6634
|
-
}
|
6635
|
-
if (std::isnan(val)) {
|
6636
|
-
std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
|
6637
|
-
std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
6638
|
-
std::cerr << std::endl;
|
6639
|
-
ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
|
6640
|
-
std::cerr << std::endl;
|
6641
|
-
std::vector<const ggml_tensor *> done;
|
6642
|
-
ggml_vk_print_graph_origin(tensor, done);
|
6643
|
-
GGML_ASSERT(false);
|
6644
|
-
}
|
6645
|
-
}
|
6646
|
-
}
|
6647
|
-
}
|
6648
|
-
}
|
6649
|
-
}
|
6650
|
-
|
6651
6640
|
void * comp_result;
|
6652
6641
|
size_t comp_size;
|
6653
6642
|
size_t comp_nb[GGML_MAX_DIMS];
|
@@ -6701,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6701
6690
|
|
6702
6691
|
src0_buffer = malloc(src0_size);
|
6703
6692
|
src0_clone->data = src0_buffer;
|
6704
|
-
if (src0->
|
6693
|
+
if (ggml_backend_buffer_is_host(src0->buffer)) {
|
6705
6694
|
memcpy(src0_clone->data, src0->data, src0_size);
|
6706
6695
|
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
6707
|
-
} else if (src0->
|
6696
|
+
} else if (ggml_backend_buffer_is_vk(src0->buffer)) {
|
6708
6697
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6709
6698
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
6710
6699
|
uint64_t offset = extra->offset;
|
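Note: from this hunk on, ggml_vk_check_results_0 decides how to clone each source tensor by inspecting its buffer instead of the removed tensor->backend field: host-visible buffers are copied with memcpy, Vulkan device buffers are read back first. A simplified sketch of that dispatch; Buffer and read_from_device() are toy stand-ins for the real buffer types and ggml_vk_buffer_read():

    // Host-vs-device clone dispatch used when building the CPU reference inputs.
    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct Buffer {
        bool              is_host;    // host-visible vs device-local
        std::vector<char> device_mem; // pretend device storage
    };

    // Placeholder for a GPU read-back such as ggml_vk_buffer_read().
    static void read_from_device(const Buffer & b, size_t offset, void * dst, size_t size) {
        std::memcpy(dst, b.device_mem.data() + offset, size);
    }

    static void clone_tensor_data(const Buffer & b, const void * host_data,
                                  size_t offset, void * dst, size_t size) {
        if (b.is_host) {
            std::memcpy(dst, host_data, size);      // host buffer: plain memcpy
        } else {
            read_from_device(b, offset, dst, size); // device buffer: read back first
        }
    }

    int main() {
        Buffer dev = { false, std::vector<char>(16, 0x2a) };
        char dst[16];
        clone_tensor_data(dev, nullptr, 0, dst, sizeof(dst));
        std::printf("first byte read back: %d\n", dst[0]); // 42
        return 0;
    }
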
@@ -6735,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6735
6724
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
6736
6725
|
ggml_vk_print_tensor(ctx, src0, "src0");
|
6737
6726
|
}
|
6738
|
-
|
6739
|
-
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
|
6740
6727
|
}
|
6741
6728
|
if (src1 != nullptr) {
|
6742
6729
|
src1_clone = ggml_dup_tensor(ggml_ctx, src1);
|
@@ -6745,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6745
6732
|
|
6746
6733
|
src1_buffer = malloc(src1_size);
|
6747
6734
|
src1_clone->data = src1_buffer;
|
6748
|
-
if (src1->
|
6735
|
+
if (ggml_backend_buffer_is_host(src1->buffer)) {
|
6749
6736
|
memcpy(src1_clone->data, src1->data, src1_size);
|
6750
6737
|
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
6751
|
-
} else if (src1->
|
6738
|
+
} else if (ggml_backend_buffer_is_vk(src1->buffer)) {
|
6752
6739
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6753
6740
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
6754
6741
|
uint64_t offset = extra->offset;
|
@@ -6779,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6779
6766
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
6780
6767
|
ggml_vk_print_tensor(ctx, src1, "src1");
|
6781
6768
|
std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
|
6782
|
-
std::cerr << "src1_clone=" << tensor << " src1_clone->
|
6769
|
+
std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
|
6783
6770
|
if (src1->src[0] != nullptr) {
|
6784
|
-
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << "
|
6771
|
+
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
|
6785
6772
|
}
|
6786
6773
|
if (src1->src[1] != nullptr) {
|
6787
|
-
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << "
|
6774
|
+
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
|
6788
6775
|
}
|
6789
6776
|
std::cerr << std::endl << "Result:" << std::endl;
|
6790
6777
|
ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
|
@@ -6795,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6795
6782
|
std::vector<const ggml_tensor *> done;
|
6796
6783
|
ggml_vk_print_graph_origin(src1_clone, done);
|
6797
6784
|
}
|
6798
|
-
|
6799
|
-
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
|
6800
6785
|
}
|
6801
6786
|
if (src2 != nullptr) {
|
6802
6787
|
src2_clone = ggml_dup_tensor(ggml_ctx, src2);
|
@@ -6805,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6805
6790
|
|
6806
6791
|
src2_buffer = malloc(src2_size);
|
6807
6792
|
src2_clone->data = src2_buffer;
|
6808
|
-
if (src2->
|
6793
|
+
if (ggml_backend_buffer_is_host(src2->buffer)) {
|
6809
6794
|
memcpy(src2_clone->data, src2->data, src2_size);
|
6810
6795
|
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
6811
|
-
} else if (src2->
|
6796
|
+
} else if (ggml_backend_buffer_is_vk(src2->buffer)) {
|
6812
6797
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
|
6813
|
-
vk_buffer
|
6798
|
+
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
6814
6799
|
uint64_t offset = extra->offset;
|
6815
6800
|
if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
|
6816
6801
|
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
6817
6802
|
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
6818
6803
|
const int idx = i3*src2->ne[2] + i2;
|
6819
|
-
ggml_vk_buffer_read(ctx,
|
6804
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
6820
6805
|
}
|
6821
6806
|
}
|
6822
6807
|
|
@@ -6826,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6826
6811
|
src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
|
6827
6812
|
}
|
6828
6813
|
} else {
|
6829
|
-
if (offset + src2_size >=
|
6830
|
-
src2_size =
|
6814
|
+
if (offset + src2_size >= buffer_gpu->size) {
|
6815
|
+
src2_size = buffer_gpu->size - offset;
|
6831
6816
|
}
|
6832
|
-
ggml_vk_buffer_read(ctx,
|
6817
|
+
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
|
6833
6818
|
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
6834
6819
|
}
|
6835
6820
|
} else {
|
@@ -6839,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6839
6824
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
6840
6825
|
ggml_vk_print_tensor(ctx, src2, "src2");
|
6841
6826
|
std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
|
6842
|
-
std::cerr << "src2_clone=" << tensor << " src2_clone->
|
6827
|
+
std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
|
6843
6828
|
if (src2->src[0] != nullptr) {
|
6844
|
-
std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << "
|
6829
|
+
std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
|
6845
6830
|
}
|
6846
6831
|
if (src2->src[1] != nullptr) {
|
6847
|
-
std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << "
|
6832
|
+
std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
|
6848
6833
|
}
|
6849
6834
|
std::cerr << std::endl << "Result:" << std::endl;
|
6850
6835
|
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
|
@@ -6855,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6855
6840
|
std::vector<const ggml_tensor *> done;
|
6856
6841
|
ggml_vk_print_graph_origin(src2_clone, done);
|
6857
6842
|
}
|
6858
|
-
|
6859
|
-
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
|
6860
6843
|
}
|
6861
6844
|
|
6862
6845
|
if (tensor->op == GGML_OP_MUL_MAT) {
|
@@ -6877,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6877
6860
|
tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
|
6878
6861
|
} else if (tensor->op == GGML_OP_SOFT_MAX) {
|
6879
6862
|
if (src1 != nullptr) {
|
6880
|
-
tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone,
|
6863
|
+
tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
6881
6864
|
} else {
|
6882
6865
|
tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
|
6883
6866
|
}
|
@@ -6894,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6894
6877
|
float attn_factor = ((float *) tensor->op_params)[8];
|
6895
6878
|
float beta_fast = ((float *) tensor->op_params)[9];
|
6896
6879
|
float beta_slow = ((float *) tensor->op_params)[10];
|
6897
|
-
tensor_clone =
|
6880
|
+
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
6898
6881
|
} else if (tensor->op == GGML_OP_UNARY) {
|
6899
6882
|
switch (ggml_get_unary_op(tensor)) {
|
6900
6883
|
case GGML_UNARY_OP_SILU:
|
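Note: to rebuild the reference op on the CPU, the checker recovers scalar arguments (scale, max_bias, freq_base, ext_factor, ...) from tensor->op_params, a small raw parameter block, by reinterpreting its 32-bit words as floats. A hedged sketch of that unpacking pattern; the layout assumed here (scale at word 0, max_bias at word 1) mirrors the soft_max call above and is illustrative only:

    // Recover float parameters from a raw op_params block via memcpy
    // (well-defined type punning). FakeTensor is a stand-in for ggml_tensor.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct FakeTensor {
        int32_t op_params[16]; // raw parameter words
    };

    static float get_op_params_f32(const FakeTensor & t, int i) {
        float v;
        std::memcpy(&v, &t.op_params[i], sizeof(v));
        return v;
    }

    int main() {
        FakeTensor t = {};
        const float scale = 0.125f, max_bias = 8.0f;
        std::memcpy(&t.op_params[0], &scale,    sizeof(scale));
        std::memcpy(&t.op_params[1], &max_bias, sizeof(max_bias));

        std::printf("scale=%.3f max_bias=%.1f\n",
                    get_op_params_f32(t, 0), get_op_params_f32(t, 1));
        return 0;
    }
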
@@ -6937,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6937
6920
|
GGML_ASSERT(false);
|
6938
6921
|
}
|
6939
6922
|
|
6940
|
-
// Disable vulkan here to avoid the hooks in ggml.c
|
6941
|
-
ctx->disable = true;
|
6942
|
-
|
6943
6923
|
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
|
6944
6924
|
ggml_build_forward_expand(cgraph, tensor_clone);
|
6945
6925
|
|
6946
6926
|
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
|
6947
6927
|
|
6948
|
-
ctx->disable = false;
|
6949
|
-
|
6950
|
-
ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
|
6951
6928
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
6952
6929
|
ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
|
6953
6930
|
}
|
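Note: with the Vulkan hooks removed from ggml.c, the checker no longer has to toggle ctx->disable around the reference computation; it just builds a one-node graph for the cloned tensor and runs it on the CPU with ggml_graph_compute_with_ctx, as the remaining lines show. A small example of that build-and-compute pattern against the ggml API used above; the shapes and values are made up, and the snippet needs ggml.h and the ggml library to build:

    // Build a tiny graph and compute it on the CPU, mirroring the reference
    // path in ggml_vk_check_results_0. Shapes and values are arbitrary.
    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
        struct ggml_context * ctx = ggml_init(params);

        // a: 4x2, b: 4x3 -> ggml_mul_mat(a, b) yields a 2x3 result
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);
        struct ggml_tensor * out = ggml_mul_mat(ctx, a, b);

        struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
        ggml_build_forward_expand(cgraph, out);
        ggml_graph_compute_with_ctx(ctx, cgraph, 8);

        std::printf("out[0] = %f\n", ggml_get_f32_1d(out, 0)); // expect 8.0
        ggml_free(ctx);
        return 0;
    }
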
@@ -6964,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6964
6941
|
if (src1 != nullptr) {
|
6965
6942
|
free(src1_buffer);
|
6966
6943
|
}
|
6967
|
-
if (src2 != nullptr) {
|
6968
|
-
free(src2_buffer);
|
6969
|
-
}
|
6970
6944
|
|
6971
6945
|
ggml_free(ggml_ctx);
|
6972
6946
|
}
|
@@ -6991,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
6991
6965
|
|
6992
6966
|
void * tensor_data = tensor->data;
|
6993
6967
|
|
6994
|
-
if (tensor->
|
6968
|
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
6995
6969
|
size_t tensor_size = ggml_nbytes(tensor);
|
6996
6970
|
tensor_data = malloc(tensor_size);
|
6997
6971
|
|
@@ -7026,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
7026
7000
|
} else if (tensor->type == GGML_TYPE_F16) {
|
7027
7001
|
correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
|
7028
7002
|
result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
7003
|
+
} else if (tensor->type == GGML_TYPE_I32) {
|
7004
|
+
correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
|
7005
|
+
result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
7029
7006
|
} else {
|
7030
|
-
std::cerr << "
|
7007
|
+
std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
|
7031
7008
|
}
|
7032
7009
|
} else {
|
7033
7010
|
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
|
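Note: the element-wise comparison now also covers GGML_TYPE_I32 outputs; for every index it reads the CPU reference ("correct") and the backend value ("result"), accumulates an average error, and the later hunks assert when the average exceeds 0.05 or when NaN/Inf patterns differ. A self-contained sketch of that check over two flat float buffers with made-up data and the same 0.05 threshold:

    // Sketch of the reference-vs-backend comparison loop.
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> correct = {1.0f, 2.0f, 3.0f, 4.0f}; // CPU reference
        const std::vector<float> result  = {1.0f, 2.0f, 3.1f, 3.9f}; // backend output

        double avg_err = 0.0;
        size_t counter = 0;
        for (size_t i = 0; i < correct.size(); i++) {
            if (std::isnan(correct[i]) != std::isnan(result[i])) {
                std::fprintf(stderr, "NaN mismatch at index %zu\n", i);
                return 1;
            }
            avg_err += std::fabs((double) correct[i] - (double) result[i]);
            counter++;
        }
        avg_err /= (double) counter;

        if (avg_err > 0.05 || std::isnan(avg_err)) {
            std::fprintf(stderr, "ERROR: avg_err=%f\n", avg_err);
            return 1;
        }
        std::printf("check passed, avg_err=%f\n", avg_err);
        return 0;
    }
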
@@ -7036,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
7036
7013
|
|
7037
7014
|
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
|
7038
7015
|
std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
|
7039
|
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
7016
|
+
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
7040
7017
|
if (src0 != nullptr) {
|
7041
|
-
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
7018
|
+
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
7042
7019
|
}
|
7043
7020
|
if (src1 != nullptr) {
|
7044
|
-
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
7021
|
+
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
7045
7022
|
}
|
7046
7023
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
7047
7024
|
std::cerr << std::endl << "Result:" << std::endl;
|
@@ -7077,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
7077
7054
|
|
7078
7055
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
7079
7056
|
std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
7080
|
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
7057
|
+
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
7081
7058
|
if (src0 != nullptr) {
|
7082
|
-
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
7059
|
+
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
7083
7060
|
}
|
7084
7061
|
if (src1 != nullptr) {
|
7085
|
-
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
7062
|
+
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
7086
7063
|
}
|
7087
7064
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
7088
7065
|
std::cerr << std::endl << "Result:" << std::endl;
|
@@ -7101,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
7101
7078
|
|
7102
7079
|
if (avg_err > 0.05 || std::isnan(avg_err)) {
|
7103
7080
|
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
7104
|
-
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
|
7081
|
+
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
7105
7082
|
if (src0 != nullptr) {
|
7106
|
-
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
|
7083
|
+
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
7107
7084
|
}
|
7108
7085
|
if (src1 != nullptr) {
|
7109
|
-
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
|
7086
|
+
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
7110
7087
|
}
|
7111
7088
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
7112
7089
|
std::cerr << std::endl << "Result:" << std::endl;
|
@@ -7118,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
7118
7095
|
ggml_vk_print_graph_origin(tensor, done);
|
7119
7096
|
GGML_ASSERT(false);
|
7120
7097
|
} else {
|
7121
|
-
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << "
|
7098
|
+
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
|
7122
7099
|
}
|
7123
7100
|
|
7124
7101
|
free(comp_result);
|
7125
7102
|
comp_result = nullptr;
|
7126
7103
|
comp_size = 0;
|
7127
7104
|
|
7128
|
-
if (tensor->
|
7105
|
+
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
7129
7106
|
free(tensor_data);
|
7130
7107
|
}
|
7131
7108
|
}
|