llama_cpp 0.15.1 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -114,6 +114,7 @@ struct vk_device {
     size_t idx;
 
     vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
     vk_matmul_pipeline pipeline_matmul_f16;
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
     float corr_dims[4];
     float theta_scale;
     float inv_ndims;
+    uint32_t has_freq_facs;
 };
 
 struct vk_op_soft_max_push_constants {
     uint32_t KX;
     uint32_t KY;
-    uint32_t KZ;
     float scale;
     float max_bias;
     float m0;
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {
 
 struct vk_op_argsort_push_constants {
     uint32_t ncols;
-
+    uint32_t ncols_pad;
+    int32_t order;
 };
 
 // Allow pre-recording command buffers
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
     vk_context * compute_ctx;
     vk_context * transfer_ctx;
 
-    bool disable;
     bool initialized;
 
     size_t idx;
 };
 
-struct
+struct vk_instance_t {
     vk::Instance instance;
 
     std::vector<size_t> device_indices;
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 
 static bool vk_instance_initialized = false;
-static
+static vk_instance_t vk_instance;
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align = 32;
 
     ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main",
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main",
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main",
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main",
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
 }
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;
 
-    ctx->disable = false;
     ctx->initialized = true;
 
     ctx->idx = idx;
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f16_f32;
     }
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
 #endif
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
 #endif
     const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
         0, 0, 0, 0, 1
     ); // NOLINT
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) ((char *) dst->data);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
-    const uint64_t ne10 = src1->ne[0];
-
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-           (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
-           dst->type == GGML_TYPE_F32 &&
-           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3830,12 +3811,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_SOFT_MAX:
         GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
-        GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32 || src2->type == GGML_TYPE_F16);
 
-        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) &&
+        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_soft_max_f32;
        }
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 &&
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_soft_max_f32_f16;
        }
        return nullptr;
@@ -3874,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     default:
         return nullptr;
     }
+
+    GGML_UNUSED(src2);
 }
 
 static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3903,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
+    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
-        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
+        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
-        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ",
+        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3920,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne02 = src0->ne[2];
     const uint64_t ne03 = src0->ne[3];
     const uint64_t ne0 = ne00 * ne01;
+
     const bool use_src1 = src1 != nullptr;
     const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
     const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3927,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
     const uint64_t ne1 = ne10 * ne11;
     // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
 
     const bool use_src2 = src2 != nullptr;
-    const uint64_t
+    const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+    const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+    const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+    const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+    const uint64_t ne2 = ne20 * ne21;
 
     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
     ggml_vk_func_t op_func;
@@ -3977,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             src1_uma = d_Y != nullptr;
         }
         if (use_src2) {
-            ggml_vk_host_get(ctx,
+            ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
             src2_uma = d_Z != nullptr;
         }
     }
@@ -3990,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();
 
     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 &&
+    if (use_src1 && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }
 
@@ -4007,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }
-
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
@@ -4017,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     if (op_supports_incontiguous) {
         x_sz = ggml_nbytes(src0);
         y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) : 0;
         d_sz = ggml_nbytes(dst);
 
         if (x_buf_offset + x_sz >= d_X->size) {
@@ -4025,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
             y_sz = VK_WHOLE_SIZE;
         }
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = VK_WHOLE_SIZE;
+        }
         if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
@@ -4047,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
         case GGML_OP_GET_ROWS:
-            elements = {
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
+        case GGML_OP_ARGSORT:
+            elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
             break;
         default:
             elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
@@ -4061,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                 y_sz *= ne12 * ne13;
             }
+            if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+                z_sz *= ne22 * ne23;
+            }
             if (d_sz != VK_WHOLE_SIZE) {
                 d_sz *= ne02 * ne03;
             }
         }
 
         if (op == GGML_OP_SOFT_MAX) {
-            // Empty src1
+            // Empty src1 is possible in soft_max, but the shader needs a buffer
             vk_subbuffer subbuf_y;
             if (use_src1) {
                 subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4075,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                 subbuf_y = { d_X, 0, d_X->size };
             }
 
-
-
-
+            ggml_vk_sync_buffers(subctx);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        } else if (op == GGML_OP_ROPE) {
+            const int mode = ((int32_t *) dst->op_params)[2];
+            const bool is_neox = mode & 2;
+
+            if (is_neox) {
+                // Empty src2 is possible in rope, but the shader needs a buffer
+                vk_subbuffer subbuf_z;
+                if (use_src2) {
+                    subbuf_z = { d_Z, z_buf_offset, z_sz };
+                } else {
+                    subbuf_z = { d_X, 0, d_X->size };
+                }
+
+                ggml_vk_sync_buffers(subctx);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
             } else {
-
+                ggml_vk_sync_buffers(subctx);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
             }
-
+        } else if (use_src2) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz },
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src1) {
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4091,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
-            ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-        } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
-            // copy dst to host
-            float * d = (float *) dst->data;
-            ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
-        }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
+        GGML_ASSERT(op != GGML_OP_ARGSORT);
+        GGML_ASSERT(!use_src2);
 
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
 
         switch (dst->op) {
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
             elements = { (uint32_t)ne01, 1, 1 };
             break;
         case GGML_OP_DIAG_MASK_INF:
@@ -4136,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                 ggml_vk_sync_buffers(subctx);
                 ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
             }
-            if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                // copy dst to host
-                ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
-            }
         }
     }
 }
@@ -4270,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
 }
 
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1,
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     float * op_params = (float *)dst->op_params;
 
     float scale = op_params[0];
@@ -4286,17 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
     const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1,
+    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
-        src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
         scale, max_bias,
         m0, m1,
         n_head_log2,
     });
 }
 
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4319,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1,
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
+            src2 != nullptr,
+        });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1,
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+            (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
+            freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
+        });
     }
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     int32_t * op_params = (int32_t *)dst->op_params;
-
+
+    uint32_t ncols = src0->ne[0];
+
+    uint32_t ncols_pad = 1;
+    while (ncols_pad < ncols) {
+        ncols_pad *= 2;
+    }
+
+    GGML_ASSERT(ncols_pad <= 1024);
+
+    std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
+
+    std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
+
+    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+        ncols,
+        ncols_pad,
+        op_params[0],
+    });
 }
 
 #ifdef GGML_VULKAN_RUN_TESTS
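The `ncols_pad` and `order` push constants added in this hunk support the argsort path, which appears to sort each row with a bitonic-style shader that only handles power-of-two row widths: the host rounds the row length up to the next power of two, and the `GGML_ASSERT(ncols_pad <= 1024)` caps it at the 1024-wide workgroup dispatched for `argsort_f32`. A minimal standalone sketch of that rounding rule, for illustration only (the `round_up_pow2` helper name is ours, not part of the library):

```cpp
#include <cassert>
#include <cstdint>

// Round a row width up to the next power of two, mirroring the ncols_pad
// computation shown in ggml_vk_argsort above.
static uint32_t round_up_pow2(uint32_t ncols) {
    uint32_t ncols_pad = 1;
    while (ncols_pad < ncols) {
        ncols_pad *= 2;
    }
    return ncols_pad;
}

int main() {
    assert(round_up_pow2(1)    == 1);
    assert(round_up_pow2(1000) == 1024); // a 1000-wide row is padded to 1024
    assert(round_up_pow2(1024) == 1024); // exact powers of two are unchanged
    return 0;
}
```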
@@ -4379,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->a_s;
             shname = "F32_ALIGNED_S";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_s;
+            shname = "F32_F16_ALIGNED_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->a_s;
             shname = "F16_F32_ALIGNED_S";
@@ -4392,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->a_m;
             shname = "F32_ALIGNED_M";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_m;
+            shname = "F32_F16_ALIGNED_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->a_m;
             shname = "F16_F32_ALIGNED_M";
@@ -4405,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->a_l;
             shname = "F32_ALIGNED_L";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_l;
+            shname = "F32_F16_ALIGNED_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->a_l;
             shname = "F16_F32_ALIGNED_L";
@@ -4425,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->s;
             shname = "F32_S";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->s;
+            shname = "F32_F16_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->s;
             shname = "F16_F32_S";
@@ -4436,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->m;
             shname = "F32_M";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->m;
+            shname = "F32_F16_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->m;
             shname = "F16_F32_M";
@@ -4447,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f32->l;
             shname = "F32_L";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->l;
+            shname = "F32_F16_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
             p = ctx->device->pipeline_matmul_f16_f32->l;
             shname = "F16_F32_L";
@@ -4559,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5047,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;
 
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
 
-    ctx->disable = false;
-
     ggml_free(ggml_ctx);
 
     double avg_err = 0.0;
@@ -5132,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (extra == nullptr) {
         return;
     }
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
 
@@ -5242,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
@@ -5418,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (ggml_is_empty(node) || extra == nullptr) {
         return;
     }
 
@@ -5432,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src1 = node->src[1];
     const ggml_tensor * src2 = node->src[2];
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     switch (node->op) {
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -5545,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SOFT_MAX:
-        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1,
+        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
 
         break;
     case GGML_OP_ROPE:
-        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
+        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
 
         break;
     case GGML_OP_ARGSORT:
@@ -5578,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (
+    if (last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -5586,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    if (ctx->disable) {
-        return false;
-    }
-
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5648,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }
 
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
 #endif
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5688,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 
 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
 #endif
@@ -5863,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }
 
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }
 
@@ -5871,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5886,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6030,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
|
|
6030
6048
|
#ifdef GGML_VULKAN_DEBUG
|
6031
6049
|
std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
6032
6050
|
#endif
|
6051
|
+
size += 32; // Behave like the CPU buffer type
|
6033
6052
|
void * ptr = nullptr;
|
6034
6053
|
try {
|
6035
6054
|
ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
|
@@ -6117,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -6138,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
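Several hunks in this file drop `GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU)` checks: placement is no longer tracked in a per-tensor `backend` field but derived from the buffer the tensor lives in (`ggml_backend_buffer_is_host()`, `ggml_backend_buffer_is_vk()`). A rough, self-contained sketch of that dispatch pattern with mock types (none of the type or function names below are ggml's; they only illustrate the idea):

```cpp
// Mock sketch: decide how to move data by asking the *buffer* where it lives,
// instead of trusting a per-tensor backend enum.
#include <iostream>
#include <string>

enum class buffer_kind { host, device };

struct mock_buffer { buffer_kind kind; };
struct mock_tensor { const mock_buffer * buffer; std::string name; };

static bool buffer_is_host(const mock_buffer * b)   { return b && b->kind == buffer_kind::host; }
static bool buffer_is_device(const mock_buffer * b) { return b && b->kind == buffer_kind::device; }

static void set_tensor(const mock_tensor & t) {
    if (buffer_is_host(t.buffer)) {
        std::cout << t.name << ": plain memcpy into host memory\n";
    } else if (buffer_is_device(t.buffer)) {
        std::cout << t.name << ": staged upload into the device buffer\n";
    }
}

int main() {
    mock_buffer dev  { buffer_kind::device };
    mock_buffer host { buffer_kind::host };
    set_tensor({ &dev,  "kv_cache" });
    set_tensor({ &host, "inp_tokens" });
    return 0;
}
```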
@@ -6204,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     ctx->transfer_ctx = nullptr;
 }
 
+static bool ggml_vk_is_empty(ggml_tensor * node) {
+    return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+}
+
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6218,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     int last_node = cgraph->n_nodes - 1;
 
     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && (cgraph->nodes[last_node]
+    while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
         last_node -= 1;
     }
 
@@ -6232,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (
+        if (ggml_vk_is_empty(node)) {
             continue;
         }
 
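Both `ggml_backend_vk_graph_compute` hunks above route through the new `ggml_vk_is_empty()` helper: trailing nodes that do no work are trimmed first (per the comment, so the command buffer gets closed properly), and such nodes are skipped while recording the graph. A self-contained sketch of the same two loops over a mock graph (the node and op types are stand-ins, not ggml's):

```cpp
// Mock sketch of trimming trailing no-op nodes and skipping no-ops while
// walking a compute graph.
#include <cstdio>
#include <vector>

enum class op { none, reshape, view, permute, transpose, mul_mat, add };

struct node { op o; };

static bool is_empty(const node & n) {
    // Reshape/view/permute/transpose only reinterpret metadata; nothing to run.
    return n.o == op::none || n.o == op::reshape || n.o == op::view ||
           n.o == op::permute || n.o == op::transpose;
}

int main() {
    std::vector<node> graph = {
        { op::mul_mat }, { op::reshape }, { op::add }, { op::view }, { op::permute },
    };

    int last_node = (int) graph.size() - 1;
    while (last_node > 0 && is_empty(graph[last_node])) {
        last_node -= 1;                 // stops at index 2, the add node
    }
    std::printf("last real node: %d\n", last_node);

    for (int i = 0; i < (int) graph.size(); i++) {
        if (is_empty(graph[i])) {
            continue;                   // nothing to record for no-ops
        }
        std::printf("would record node %d\n", i);
    }
    return 0;
}
```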
@@ -6534,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
     for (int j = 0; j < level; j++) {
         std::cerr << " ";
     }
-    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) <<
+    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
 
     done.push_back(tensor);
 
@@ -6546,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
 }
 
 static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
-    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
+    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
         return;
     }
     i0 = std::max(i0, 5);
@@ -6567,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
                 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else if (tensor->type == GGML_TYPE_I32) {
+                val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else {
                 GGML_ASSERT(false);
             }
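The `GGML_TYPE_I32` branches added here use the same byte-stride addressing as the existing F32/F16 cases: the element at `(i0, i1, i2, i3)` lives at `i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0]` bytes from the data pointer. A small self-contained example of that arithmetic (the tensor shape and strides below are made up for illustration):

```cpp
// Sketch of ggml-style byte-stride addressing for an int32 tensor with
// ne = {3, 2, 1, 1}, stored row-major as data[2][3].
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t data[2][3] = { {1, 2, 3}, {4, 5, 6} };

    // nb[0] = element size, nb[i] = nb[i-1] * ne[i-1].
    size_t nb[4] = { sizeof(int32_t), 3 * sizeof(int32_t), 6 * sizeof(int32_t), 6 * sizeof(int32_t) };

    const char * base = (const char *) data;
    for (int i1 = 0; i1 < 2; i1++) {
        for (int i0 = 0; i0 < 3; i0++) {
            // Same addressing as the I32 branch added in the diff.
            int32_t val;
            std::memcpy(&val, base + i1*nb[1] + i0*nb[0], sizeof(val));
            std::printf("%d ", val);
        }
        std::printf("\n");
    }
    return 0;
}
```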
@@ -6582,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;
 
-    if (tensor->
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -6593,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
-    std::cerr << "tensor=" << tensor << " tensor->
+    std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
     if (tensor->src[0] != nullptr) {
-        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << "
+        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
     }
     if (tensor->src[1] != nullptr) {
-        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << "
+        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
     }
     std::cerr << std::endl << "Result:" << std::endl;
     ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -6609,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);
 
-    if (tensor->
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         free(tensor_data);
     }
 }
 
-static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
-    return;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
-    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
-        return;
-    }
-    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float val = 0.0f;
-                    if (tensor->type == GGML_TYPE_F32) {
-                        val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
-                    } else if (tensor->type == GGML_TYPE_F16) {
-                        val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
-                    }
-                    if (std::isnan(val)) {
-                        std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
-                        std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
-                        std::cerr << std::endl;
-                        ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
-                        std::cerr << std::endl;
-                        std::vector<const ggml_tensor *> done;
-                        ggml_vk_print_graph_origin(tensor, done);
-                        GGML_ASSERT(false);
-                    }
-                }
-            }
-        }
-    }
-}
-
 void * comp_result;
 size_t comp_size;
 size_t comp_nb[GGML_MAX_DIMS];
@@ -6699,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src0_buffer = malloc(src0_size);
         src0_clone->data = src0_buffer;
-        if (src0->
+        if (ggml_backend_buffer_is_host(src0->buffer)) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src0->
+        } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
             uint64_t offset = extra->offset;
@@ -6733,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
             ggml_vk_print_tensor(ctx, src0, "src0");
         }
-
-        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
     }
     if (src1 != nullptr) {
         src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6743,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
-        if (src1->
+        if (ggml_backend_buffer_is_host(src1->buffer)) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->
+        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
             uint64_t offset = extra->offset;
@@ -6777,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
             ggml_vk_print_tensor(ctx, src1, "src1");
             std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
-            std::cerr << "src1_clone=" << tensor << " src1_clone->
+            std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
             if (src1->src[0] != nullptr) {
-                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << "
+                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
             }
             if (src1->src[1] != nullptr) {
-                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << "
+                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
             }
             std::cerr << std::endl << "Result:" << std::endl;
             ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6793,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
             std::vector<const ggml_tensor *> done;
             ggml_vk_print_graph_origin(src1_clone, done);
         }
-
-        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
     }
     if (src2 != nullptr) {
         src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6803,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 
         src2_buffer = malloc(src2_size);
         src2_clone->data = src2_buffer;
-        if (src2->
+        if (ggml_backend_buffer_is_host(src2->buffer)) {
             memcpy(src2_clone->data, src2->data, src2_size);
             memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src2->
+        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
-            vk_buffer
+            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
                         const int idx = i3*src2->ne[2] + i2;
-                        ggml_vk_buffer_read(ctx,
+                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
                     }
                 }
 
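When `src2` is not fully contiguous but its two innermost dimensions are, the copy-back above fetches one `(i3, i2)` plane at a time: `ne[1]*nb[1]` bytes at offset `idx*nb[2]`. A self-contained sketch of that plane-by-plane copy, with `memcpy` standing in for `ggml_vk_buffer_read` and an illustrative shape:

```cpp
// Sketch of the "dim01-contiguous" copy-back loop: one block per (i3, i2) plane.
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const int ne0 = 4, ne1 = 3, ne2 = 2, ne3 = 1;
    const size_t nb0 = sizeof(float), nb1 = ne0*nb0, nb2 = ne1*nb1;

    // "Device" source and host-side clone, both ne3*ne2*ne1*ne0 floats.
    std::vector<float> device(ne3*ne2*ne1*ne0), clone(device.size());
    for (size_t i = 0; i < device.size(); i++) device[i] = (float) i;

    for (int i3 = 0; i3 < ne3; i3++) {
        for (int i2 = 0; i2 < ne2; i2++) {
            const int idx = i3*ne2 + i2;
            // One "read" per plane, mirroring the loop in the diff.
            std::memcpy((char *) clone.data() + idx*nb2,
                        (const char *) device.data() + idx*nb2,
                        ne1*nb1);
        }
    }
    std::printf("clone[last] = %g\n", clone.back());
    return 0;
}
```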
@@ -6824,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
                     src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
                 }
             } else {
-                if (offset + src2_size >=
-                    src2_size =
+                if (offset + src2_size >= buffer_gpu->size) {
+                    src2_size = buffer_gpu->size - offset;
                 }
-                ggml_vk_buffer_read(ctx,
+                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
                 memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
@@ -6837,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
             ggml_vk_print_tensor(ctx, src2, "src2");
             std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
-            std::cerr << "src2_clone=" << tensor << " src2_clone->
+            std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
             if (src2->src[0] != nullptr) {
-                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << "
+                std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
             }
             if (src2->src[1] != nullptr) {
-                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << "
+                std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
             }
             std::cerr << std::endl << "Result:" << std::endl;
             ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6853,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
             std::vector<const ggml_tensor *> done;
             ggml_vk_print_graph_origin(src2_clone, done);
         }
-
-        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6875,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
     } else if (tensor->op == GGML_OP_SOFT_MAX) {
         if (src1 != nullptr) {
-            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone,
+            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
         } else {
             tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
         }
@@ -6892,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         float attn_factor = ((float *) tensor->op_params)[8];
         float beta_fast = ((float *) tensor->op_params)[9];
         float beta_slow = ((float *) tensor->op_params)[10];
-        tensor_clone =
+        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
     } else if (tensor->op == GGML_OP_UNARY) {
         switch (ggml_get_unary_op(tensor)) {
             case GGML_UNARY_OP_SILU:
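The reference re-computation now passes the operator parameters straight from `tensor->op_params`: slots 0 and 1 for `ggml_soft_max_ext` (scale and max bias), and, for the rope case, `attn_factor`, `beta_fast` and `beta_slow` from slots 8 through 10. A self-contained sketch of packing and unpacking such a parameter block (the 16-slot layout shown is illustrative; `memcpy` is used here for well-defined type punning, whereas the code above casts the block directly):

```cpp
// Sketch: store a few float parameters in a packed int32 block and read them back.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t op_params[16] = {0};

    // Writing side: pack floats into the first two slots.
    float scale = 0.125f, max_bias = 8.0f;
    std::memcpy(&op_params[0], &scale,    sizeof(float));
    std::memcpy(&op_params[1], &max_bias, sizeof(float));

    // Reading side: recover them, as the check code does per slot.
    float scale_rd, max_bias_rd;
    std::memcpy(&scale_rd,    &op_params[0], sizeof(float));
    std::memcpy(&max_bias_rd, &op_params[1], sizeof(float));
    std::printf("scale=%g max_bias=%g\n", scale_rd, max_bias_rd);
    return 0;
}
```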
@@ -6935,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         GGML_ASSERT(false);
     }
 
-    // Disable vulkan here to avoid the hooks in ggml.c
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_clone);
 
     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
 
-    ctx->disable = false;
-
-    ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
         ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
     }
@@ -6962,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (src1 != nullptr) {
         free(src1_buffer);
     }
-    if (src2 != nullptr) {
-        free(src2_buffer);
-    }
 
     ggml_free(ggml_ctx);
 }
@@ -6989,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     void * tensor_data = tensor->data;
 
-    if (tensor->
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
@@ -7024,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
                     } else if (tensor->type == GGML_TYPE_F16) {
                         correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
                         result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
+                    } else if (tensor->type == GGML_TYPE_I32) {
+                        correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
+                        result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
                     } else {
-                        std::cerr << "
+                        std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
                     }
                 } else {
                     std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
@@ -7034,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
                     if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
                         std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
-                        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+                        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
                        if (src0 != nullptr) {
-                            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+                            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
                         }
                         if (src1 != nullptr) {
-                            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+                            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
                         }
                         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
                         std::cerr << std::endl << "Result:" << std::endl;
@@ -7075,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
         std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
         if (src0 != nullptr) {
-            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
         }
         if (src1 != nullptr) {
-            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
         }
         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
         std::cerr << std::endl << "Result:" << std::endl;
@@ -7099,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 
     if (avg_err > 0.05 || std::isnan(avg_err)) {
         std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
         if (src0 != nullptr) {
-            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
         }
         if (src1 != nullptr) {
-            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
         }
         std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
         std::cerr << std::endl << "Result:" << std::endl;
@@ -7116,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_vk_print_graph_origin(tensor, done);
         GGML_ASSERT(false);
     } else {
-        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << "
+        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
     }
 
     free(comp_result);
     comp_result = nullptr;
     comp_size = 0;
 
-    if (tensor->
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
         free(tensor_data);
     }
 }