llama_cpp 0.15.2 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -114,6 +114,7 @@ struct vk_device {
|
|
114
114
|
size_t idx;
|
115
115
|
|
116
116
|
vk_matmul_pipeline pipeline_matmul_f32;
|
117
|
+
vk_matmul_pipeline pipeline_matmul_f32_f16;
|
117
118
|
vk_matmul_pipeline pipeline_matmul_f16;
|
118
119
|
vk_matmul_pipeline pipeline_matmul_f16_f32;
|
119
120
|
vk_pipeline pipeline_matmul_split_k_reduce;
|
@@ -289,12 +290,12 @@ struct vk_op_rope_neox_push_constants {
|
|
289
290
|
float corr_dims[4];
|
290
291
|
float theta_scale;
|
291
292
|
float inv_ndims;
|
293
|
+
uint32_t has_freq_facs;
|
292
294
|
};
|
293
295
|
|
294
296
|
struct vk_op_soft_max_push_constants {
|
295
297
|
uint32_t KX;
|
296
298
|
uint32_t KY;
|
297
|
-
uint32_t KZ;
|
298
299
|
float scale;
|
299
300
|
float max_bias;
|
300
301
|
float m0;
|
@@ -304,7 +305,8 @@ struct vk_op_soft_max_push_constants {
|
|
304
305
|
|
305
306
|
struct vk_op_argsort_push_constants {
|
306
307
|
uint32_t ncols;
|
307
|
-
|
308
|
+
uint32_t ncols_pad;
|
309
|
+
int32_t order;
|
308
310
|
};
|
309
311
|
|
310
312
|
// Allow pre-recording command buffers
|
@@ -375,13 +377,12 @@ struct ggml_backend_vk_context {
|
|
375
377
|
vk_context * compute_ctx;
|
376
378
|
vk_context * transfer_ctx;
|
377
379
|
|
378
|
-
bool disable;
|
379
380
|
bool initialized;
|
380
381
|
|
381
382
|
size_t idx;
|
382
383
|
};
|
383
384
|
|
384
|
-
struct
|
385
|
+
struct vk_instance_t {
|
385
386
|
vk::Instance instance;
|
386
387
|
|
387
388
|
std::vector<size_t> device_indices;
|
@@ -423,7 +424,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
423
424
|
typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
424
425
|
|
425
426
|
static bool vk_instance_initialized = false;
|
426
|
-
static
|
427
|
+
static vk_instance_t vk_instance;
|
427
428
|
|
428
429
|
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
|
429
430
|
|
@@ -1013,6 +1014,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
1013
1014
|
uint32_t s_align = 32;
|
1014
1015
|
|
1015
1016
|
ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
1017
|
+
ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
1016
1018
|
ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
1017
1019
|
ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
1018
1020
|
ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
|
@@ -1048,6 +1050,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
1048
1050
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
1049
1051
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
1050
1052
|
|
1053
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
1054
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
1055
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
1056
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
|
1057
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
1058
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
1059
|
+
|
1051
1060
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
1052
1061
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
1053
1062
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
@@ -1230,6 +1239,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
1230
1239
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
1231
1240
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
1232
1241
|
|
1242
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
1243
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
1244
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
1245
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
|
1246
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
|
1247
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
|
1248
|
+
|
1233
1249
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
1234
1250
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
1235
1251
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
|
@@ -1501,14 +1517,14 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
1501
1517
|
|
1502
1518
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
|
1503
1519
|
|
1504
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main",
|
1505
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main",
|
1520
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
1521
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
|
1506
1522
|
|
1507
1523
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
1508
1524
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
1509
1525
|
|
1510
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main",
|
1511
|
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main",
|
1526
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
1527
|
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
1512
1528
|
|
1513
1529
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
1514
1530
|
}
|
@@ -1859,7 +1875,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
1859
1875
|
ctx->compute_ctx = nullptr;
|
1860
1876
|
ctx->transfer_ctx = nullptr;
|
1861
1877
|
|
1862
|
-
ctx->disable = false;
|
1863
1878
|
ctx->initialized = true;
|
1864
1879
|
|
1865
1880
|
ctx->idx = idx;
|
@@ -1903,6 +1918,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
1903
1918
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
1904
1919
|
return ctx->device->pipeline_matmul_f32;
|
1905
1920
|
}
|
1921
|
+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
1922
|
+
return ctx->device->pipeline_matmul_f32_f16;
|
1923
|
+
}
|
1906
1924
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
1907
1925
|
return ctx->device->pipeline_matmul_f16_f32;
|
1908
1926
|
}
|
@@ -2722,7 +2740,7 @@ static void ggml_vk_matmul(
|
|
2722
2740
|
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
|
2723
2741
|
uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
|
2724
2742
|
#ifdef GGML_VULKAN_DEBUG
|
2725
|
-
std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
|
2743
|
+
std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
|
2726
2744
|
#endif
|
2727
2745
|
ggml_vk_sync_buffers(subctx);
|
2728
2746
|
if (split_k == 1) {
|
@@ -2792,7 +2810,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
|
|
2792
2810
|
|
2793
2811
|
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
|
2794
2812
|
#ifdef GGML_VULKAN_DEBUG
|
2795
|
-
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ",
|
2813
|
+
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
2796
2814
|
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
|
2797
2815
|
#endif
|
2798
2816
|
const int tensor_type_size = ggml_type_size(tensor->type);
|
@@ -2812,9 +2830,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
|
|
2812
2830
|
|
2813
2831
|
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2814
2832
|
#ifdef GGML_VULKAN_DEBUG
|
2815
|
-
std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
2816
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
2817
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
2833
|
+
std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
2834
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
2835
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
2818
2836
|
#endif
|
2819
2837
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
2820
2838
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
@@ -2982,19 +3000,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
2982
3000
|
ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
|
2983
3001
|
0, 0, 0, 0, 1
|
2984
3002
|
); // NOLINT
|
2985
|
-
|
2986
|
-
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
2987
|
-
// copy dst to host
|
2988
|
-
float * d = (float *) ((char *) dst->data);
|
2989
|
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
|
2990
|
-
}
|
2991
3003
|
}
|
2992
3004
|
|
2993
3005
|
static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
2994
3006
|
#ifdef GGML_VULKAN_DEBUG
|
2995
|
-
std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
2996
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
2997
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
3007
|
+
std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3008
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3009
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
2998
3010
|
#endif
|
2999
3011
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
3000
3012
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
@@ -3147,12 +3159,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
3147
3159
|
|
3148
3160
|
static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3149
3161
|
#ifdef GGML_VULKAN_DEBUG
|
3150
|
-
std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
3151
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
3152
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
3162
|
+
std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3163
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3164
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
3153
3165
|
#endif
|
3154
3166
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
3155
|
-
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
3156
3167
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
|
3157
3168
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
|
3158
3169
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
@@ -3217,25 +3228,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
3217
3228
|
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
3218
3229
|
ggml_vk_sync_buffers(subctx);
|
3219
3230
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
3220
|
-
|
3221
|
-
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
3222
|
-
// copy dst to host
|
3223
|
-
float * d = (float *) dst->data;
|
3224
|
-
ggml_vk_sync_buffers(subctx);
|
3225
|
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
|
3226
|
-
}
|
3227
3231
|
}
|
3228
3232
|
|
3229
3233
|
static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3230
3234
|
#ifdef GGML_VULKAN_DEBUG
|
3231
|
-
std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
3232
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
3233
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
3235
|
+
std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3236
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3237
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
3234
3238
|
#endif
|
3235
3239
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
3236
3240
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
3237
3241
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
3238
|
-
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
3239
3242
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
3240
3243
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
3241
3244
|
|
@@ -3302,26 +3305,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
3302
3305
|
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
3303
3306
|
ggml_vk_sync_buffers(subctx);
|
3304
3307
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
3305
|
-
|
3306
|
-
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
3307
|
-
// copy dst to host
|
3308
|
-
float * d = (float *) dst->data;
|
3309
|
-
ggml_vk_sync_buffers(subctx);
|
3310
|
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
|
3311
|
-
}
|
3312
|
-
}
|
3313
|
-
|
3314
|
-
static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
|
3315
|
-
const uint64_t ne10 = src1->ne[0];
|
3316
|
-
|
3317
|
-
const uint64_t ne0 = dst->ne[0];
|
3318
|
-
const uint64_t ne1 = dst->ne[1];
|
3319
|
-
|
3320
|
-
// TODO: find the optimal values for these
|
3321
|
-
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
3322
|
-
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
|
3323
|
-
dst->type == GGML_TYPE_F32 &&
|
3324
|
-
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
|
3325
3308
|
}
|
3326
3309
|
|
3327
3310
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -3711,8 +3694,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
|
3711
3694
|
// TODO: support for transposed / permuted tensors
|
3712
3695
|
GGML_ASSERT(nb0 == sizeof(float));
|
3713
3696
|
GGML_ASSERT(nb00 == sizeof(float));
|
3714
|
-
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
3715
|
-
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
3716
3697
|
|
3717
3698
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
3718
3699
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
@@ -3834,7 +3815,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
3834
3815
|
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
3835
3816
|
return ctx->device->pipeline_soft_max_f32;
|
3836
3817
|
}
|
3837
|
-
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 &&
|
3818
|
+
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
3838
3819
|
return ctx->device->pipeline_soft_max_f32_f16;
|
3839
3820
|
}
|
3840
3821
|
return nullptr;
|
@@ -3873,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
3873
3854
|
default:
|
3874
3855
|
return nullptr;
|
3875
3856
|
}
|
3857
|
+
|
3858
|
+
GGML_UNUSED(src2);
|
3876
3859
|
}
|
3877
3860
|
|
3878
3861
|
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
@@ -3902,14 +3885,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
|
3902
3885
|
template<typename PC>
|
3903
3886
|
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
3904
3887
|
#ifdef GGML_VULKAN_DEBUG
|
3905
|
-
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
|
3888
|
+
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
3906
3889
|
if (src1 != nullptr) {
|
3907
|
-
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
|
3890
|
+
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
3908
3891
|
}
|
3909
3892
|
if (src2 != nullptr) {
|
3910
|
-
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ",
|
3893
|
+
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
3911
3894
|
}
|
3912
|
-
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
|
3895
|
+
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
3913
3896
|
#endif
|
3914
3897
|
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
3915
3898
|
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
@@ -3919,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
3919
3902
|
const uint64_t ne02 = src0->ne[2];
|
3920
3903
|
const uint64_t ne03 = src0->ne[3];
|
3921
3904
|
const uint64_t ne0 = ne00 * ne01;
|
3905
|
+
|
3922
3906
|
const bool use_src1 = src1 != nullptr;
|
3923
3907
|
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
3924
3908
|
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
@@ -3926,11 +3910,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
3926
3910
|
const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
|
3927
3911
|
const uint64_t ne1 = ne10 * ne11;
|
3928
3912
|
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
3929
|
-
const uint64_t nb2 = dst->nb[2];
|
3930
|
-
const uint64_t nb3 = dst->nb[3];
|
3931
3913
|
|
3932
3914
|
const bool use_src2 = src2 != nullptr;
|
3933
|
-
const uint64_t
|
3915
|
+
const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
|
3916
|
+
const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
|
3917
|
+
const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
|
3918
|
+
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
3919
|
+
const uint64_t ne2 = ne20 * ne21;
|
3934
3920
|
|
3935
3921
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
3936
3922
|
ggml_vk_func_t op_func;
|
@@ -3976,7 +3962,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
3976
3962
|
src1_uma = d_Y != nullptr;
|
3977
3963
|
}
|
3978
3964
|
if (use_src2) {
|
3979
|
-
ggml_vk_host_get(ctx,
|
3965
|
+
ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
|
3980
3966
|
src2_uma = d_Z != nullptr;
|
3981
3967
|
}
|
3982
3968
|
}
|
@@ -3989,7 +3975,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
3989
3975
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
3990
3976
|
|
3991
3977
|
// Workaround for tiny tensor inputs on ROPE
|
3992
|
-
if (use_src1 &&
|
3978
|
+
if (use_src1 && y_sz > d_D->size) {
|
3993
3979
|
y_sz = VK_WHOLE_SIZE;
|
3994
3980
|
}
|
3995
3981
|
|
@@ -4006,7 +3992,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4006
3992
|
y_buf_offset = extra_src1->offset;
|
4007
3993
|
GGML_ASSERT(d_Y != nullptr);
|
4008
3994
|
}
|
4009
|
-
|
4010
3995
|
if (use_src2 && !src2_uma) {
|
4011
3996
|
d_Z = extra_src2->buffer_gpu.lock();
|
4012
3997
|
z_buf_offset = extra_src2->offset;
|
@@ -4016,6 +4001,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4016
4001
|
if (op_supports_incontiguous) {
|
4017
4002
|
x_sz = ggml_nbytes(src0);
|
4018
4003
|
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
4004
|
+
z_sz = use_src2 ? ggml_nbytes(src2) : 0;
|
4019
4005
|
d_sz = ggml_nbytes(dst);
|
4020
4006
|
|
4021
4007
|
if (x_buf_offset + x_sz >= d_X->size) {
|
@@ -4024,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4024
4010
|
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
4025
4011
|
y_sz = VK_WHOLE_SIZE;
|
4026
4012
|
}
|
4013
|
+
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
4014
|
+
z_sz = VK_WHOLE_SIZE;
|
4015
|
+
}
|
4027
4016
|
if (d_buf_offset + d_sz >= d_D->size) {
|
4028
4017
|
d_sz = VK_WHOLE_SIZE;
|
4029
4018
|
}
|
@@ -4046,7 +4035,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4046
4035
|
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
|
4047
4036
|
break;
|
4048
4037
|
case GGML_OP_GET_ROWS:
|
4049
|
-
elements = {
|
4038
|
+
elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
|
4039
|
+
break;
|
4040
|
+
case GGML_OP_ARGSORT:
|
4041
|
+
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
|
4050
4042
|
break;
|
4051
4043
|
default:
|
4052
4044
|
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
|
@@ -4060,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4060
4052
|
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
4061
4053
|
y_sz *= ne12 * ne13;
|
4062
4054
|
}
|
4055
|
+
if (use_src2 && z_sz != VK_WHOLE_SIZE) {
|
4056
|
+
z_sz *= ne22 * ne23;
|
4057
|
+
}
|
4063
4058
|
if (d_sz != VK_WHOLE_SIZE) {
|
4064
4059
|
d_sz *= ne02 * ne03;
|
4065
4060
|
}
|
4066
4061
|
}
|
4067
4062
|
|
4068
4063
|
if (op == GGML_OP_SOFT_MAX) {
|
4069
|
-
// Empty src1
|
4064
|
+
// Empty src1 is possible in soft_max, but the shader needs a buffer
|
4070
4065
|
vk_subbuffer subbuf_y;
|
4071
4066
|
if (use_src1) {
|
4072
4067
|
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
@@ -4074,15 +4069,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4074
4069
|
subbuf_y = { d_X, 0, d_X->size };
|
4075
4070
|
}
|
4076
4071
|
|
4077
|
-
|
4078
|
-
|
4079
|
-
|
4072
|
+
ggml_vk_sync_buffers(subctx);
|
4073
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4074
|
+
} else if (op == GGML_OP_ROPE) {
|
4075
|
+
const int mode = ((int32_t *) dst->op_params)[2];
|
4076
|
+
const bool is_neox = mode & 2;
|
4077
|
+
|
4078
|
+
if (is_neox) {
|
4079
|
+
// Empty src2 is possible in rope, but the shader needs a buffer
|
4080
|
+
vk_subbuffer subbuf_z;
|
4081
|
+
if (use_src2) {
|
4082
|
+
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
4083
|
+
} else {
|
4084
|
+
subbuf_z = { d_X, 0, d_X->size };
|
4085
|
+
}
|
4086
|
+
|
4087
|
+
ggml_vk_sync_buffers(subctx);
|
4088
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4080
4089
|
} else {
|
4081
|
-
|
4090
|
+
ggml_vk_sync_buffers(subctx);
|
4091
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4082
4092
|
}
|
4083
|
-
|
4093
|
+
} else if (use_src2) {
|
4084
4094
|
ggml_vk_sync_buffers(subctx);
|
4085
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz },
|
4095
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4086
4096
|
} else if (use_src1) {
|
4087
4097
|
ggml_vk_sync_buffers(subctx);
|
4088
4098
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
@@ -4090,22 +4100,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4090
4100
|
ggml_vk_sync_buffers(subctx);
|
4091
4101
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4092
4102
|
}
|
4093
|
-
if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
|
4094
|
-
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
|
4095
|
-
} else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
|
4096
|
-
// copy dst to host
|
4097
|
-
float * d = (float *) dst->data;
|
4098
|
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
|
4099
|
-
}
|
4100
4103
|
} else {
|
4101
4104
|
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
4105
|
+
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
4106
|
+
GGML_ASSERT(!use_src2);
|
4102
4107
|
|
4103
4108
|
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
4104
4109
|
|
4105
4110
|
switch (dst->op) {
|
4106
4111
|
case GGML_OP_NORM:
|
4107
4112
|
case GGML_OP_RMS_NORM:
|
4108
|
-
case GGML_OP_SOFT_MAX:
|
4109
4113
|
elements = { (uint32_t)ne01, 1, 1 };
|
4110
4114
|
break;
|
4111
4115
|
case GGML_OP_DIAG_MASK_INF:
|
@@ -4135,10 +4139,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
4135
4139
|
ggml_vk_sync_buffers(subctx);
|
4136
4140
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
4137
4141
|
}
|
4138
|
-
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
4139
|
-
// copy dst to host
|
4140
|
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
|
4141
|
-
}
|
4142
4142
|
}
|
4143
4143
|
}
|
4144
4144
|
}
|
@@ -4269,7 +4269,7 @@ static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * su
|
|
4269
4269
|
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
4270
4270
|
}
|
4271
4271
|
|
4272
|
-
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1,
|
4272
|
+
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
4273
4273
|
float * op_params = (float *)dst->op_params;
|
4274
4274
|
|
4275
4275
|
float scale = op_params[0];
|
@@ -4285,20 +4285,16 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
4285
4285
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
4286
4286
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
4287
4287
|
|
4288
|
-
|
4289
|
-
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
|
4290
|
-
|
4291
|
-
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
|
4288
|
+
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
|
4292
4289
|
ncols,
|
4293
4290
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
4294
|
-
src2 != nullptr ? (uint32_t)1 : (uint32_t)0,
|
4295
4291
|
scale, max_bias,
|
4296
4292
|
m0, m1,
|
4297
4293
|
n_head_log2,
|
4298
4294
|
});
|
4299
4295
|
}
|
4300
4296
|
|
4301
|
-
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
4297
|
+
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
4302
4298
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
4303
4299
|
const int mode = ((int32_t *) dst->op_params)[2];
|
4304
4300
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
@@ -4321,15 +4317,40 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
4321
4317
|
if (is_neox) {
|
4322
4318
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
4323
4319
|
const float inv_ndims = -1.0f / n_dims;
|
4324
|
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1,
|
4320
|
+
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
4321
|
+
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
4322
|
+
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
4323
|
+
src2 != nullptr,
|
4324
|
+
});
|
4325
4325
|
} else {
|
4326
|
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1,
|
4326
|
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
4327
|
+
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
4328
|
+
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
4329
|
+
});
|
4327
4330
|
}
|
4328
4331
|
}
|
4329
4332
|
|
4330
4333
|
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
4331
4334
|
int32_t * op_params = (int32_t *)dst->op_params;
|
4332
|
-
|
4335
|
+
|
4336
|
+
uint32_t ncols = src0->ne[0];
|
4337
|
+
|
4338
|
+
uint32_t ncols_pad = 1;
|
4339
|
+
while (ncols_pad < ncols) {
|
4340
|
+
ncols_pad *= 2;
|
4341
|
+
}
|
4342
|
+
|
4343
|
+
GGML_ASSERT(ncols_pad <= 1024);
|
4344
|
+
|
4345
|
+
std::cerr << "ncols=" << ncols << " ncols_pad=" << ncols_pad << " ascending=" << op_params[0] << std::endl;
|
4346
|
+
|
4347
|
+
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
4348
|
+
|
4349
|
+
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
4350
|
+
ncols,
|
4351
|
+
ncols_pad,
|
4352
|
+
op_params[0],
|
4353
|
+
});
|
4333
4354
|
}
|
4334
4355
|
|
4335
4356
|
#ifdef GGML_VULKAN_RUN_TESTS
|
@@ -4381,6 +4402,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4381
4402
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4382
4403
|
p = ctx->device->pipeline_matmul_f32->a_s;
|
4383
4404
|
shname = "F32_ALIGNED_S";
|
4405
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4406
|
+
p = ctx->device->pipeline_matmul_f32_f16->a_s;
|
4407
|
+
shname = "F32_F16_ALIGNED_S";
|
4384
4408
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4385
4409
|
p = ctx->device->pipeline_matmul_f16_f32->a_s;
|
4386
4410
|
shname = "F16_F32_ALIGNED_S";
|
@@ -4394,6 +4418,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4394
4418
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4395
4419
|
p = ctx->device->pipeline_matmul_f32->a_m;
|
4396
4420
|
shname = "F32_ALIGNED_M";
|
4421
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4422
|
+
p = ctx->device->pipeline_matmul_f32_f16->a_m;
|
4423
|
+
shname = "F32_F16_ALIGNED_M";
|
4397
4424
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4398
4425
|
p = ctx->device->pipeline_matmul_f16_f32->a_m;
|
4399
4426
|
shname = "F16_F32_ALIGNED_M";
|
@@ -4407,6 +4434,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4407
4434
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4408
4435
|
p = ctx->device->pipeline_matmul_f32->a_l;
|
4409
4436
|
shname = "F32_ALIGNED_L";
|
4437
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4438
|
+
p = ctx->device->pipeline_matmul_f32_f16->a_l;
|
4439
|
+
shname = "F32_F16_ALIGNED_L";
|
4410
4440
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4411
4441
|
p = ctx->device->pipeline_matmul_f16_f32->a_l;
|
4412
4442
|
shname = "F16_F32_ALIGNED_L";
|
@@ -4427,6 +4457,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4427
4457
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4428
4458
|
p = ctx->device->pipeline_matmul_f32->s;
|
4429
4459
|
shname = "F32_S";
|
4460
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4461
|
+
p = ctx->device->pipeline_matmul_f32_f16->s;
|
4462
|
+
shname = "F32_F16_S";
|
4430
4463
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4431
4464
|
p = ctx->device->pipeline_matmul_f16_f32->s;
|
4432
4465
|
shname = "F16_F32_S";
|
@@ -4438,6 +4471,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4438
4471
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4439
4472
|
p = ctx->device->pipeline_matmul_f32->m;
|
4440
4473
|
shname = "F32_M";
|
4474
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4475
|
+
p = ctx->device->pipeline_matmul_f32_f16->m;
|
4476
|
+
shname = "F32_F16_M";
|
4441
4477
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4442
4478
|
p = ctx->device->pipeline_matmul_f16_f32->m;
|
4443
4479
|
shname = "F16_F32_M";
|
@@ -4449,6 +4485,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4449
4485
|
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4450
4486
|
p = ctx->device->pipeline_matmul_f32->l;
|
4451
4487
|
shname = "F32_L";
|
4488
|
+
} else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
|
4489
|
+
p = ctx->device->pipeline_matmul_f32_f16->l;
|
4490
|
+
shname = "F32_F16_L";
|
4452
4491
|
} else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
|
4453
4492
|
p = ctx->device->pipeline_matmul_f16_f32->l;
|
4454
4493
|
shname = "F16_F32_L";
|
@@ -4561,15 +4600,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
4561
4600
|
src1_ggml->data = y;
|
4562
4601
|
tensor_ggml->data = d_chk;
|
4563
4602
|
|
4564
|
-
ctx->disable = true;
|
4565
|
-
|
4566
4603
|
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
|
4567
4604
|
ggml_build_forward_expand(cgraph, tensor_ggml);
|
4568
4605
|
|
4569
4606
|
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
|
4570
4607
|
|
4571
|
-
ctx->disable = false;
|
4572
|
-
|
4573
4608
|
ggml_free(ggml_ctx);
|
4574
4609
|
|
4575
4610
|
double avg_err = 0.0;
|
@@ -5049,15 +5084,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
5049
5084
|
src1_ggml->data = y;
|
5050
5085
|
tensor_ggml->data = d_chk;
|
5051
5086
|
|
5052
|
-
ctx->disable = true;
|
5053
|
-
|
5054
5087
|
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
|
5055
5088
|
ggml_build_forward_expand(cgraph, tensor_ggml);
|
5056
5089
|
|
5057
5090
|
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
|
5058
5091
|
|
5059
|
-
ctx->disable = false;
|
5060
|
-
|
5061
5092
|
ggml_free(ggml_ctx);
|
5062
5093
|
|
5063
5094
|
double avg_err = 0.0;
|
@@ -5134,12 +5165,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
5134
5165
|
#ifdef GGML_VULKAN_DEBUG
|
5135
5166
|
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
5136
5167
|
#endif
|
5137
|
-
|
5168
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
5169
|
+
|
5170
|
+
if (extra == nullptr) {
|
5138
5171
|
return;
|
5139
5172
|
}
|
5140
5173
|
|
5141
|
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
5142
|
-
|
5143
5174
|
ggml_tensor * src0 = node->src[0];
|
5144
5175
|
ggml_tensor * src1 = node->src[1];
|
5145
5176
|
|
@@ -5244,9 +5275,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
5244
5275
|
}
|
5245
5276
|
|
5246
5277
|
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
5247
|
-
if (ctx->disable) {
|
5248
|
-
return;
|
5249
|
-
}
|
5250
5278
|
#ifdef GGML_VULKAN_DEBUG
|
5251
5279
|
std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
5252
5280
|
#endif
|
@@ -5420,7 +5448,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
5420
5448
|
}
|
5421
5449
|
|
5422
5450
|
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
|
5423
|
-
|
5451
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
5452
|
+
|
5453
|
+
if (ggml_is_empty(node) || extra == nullptr) {
|
5424
5454
|
return;
|
5425
5455
|
}
|
5426
5456
|
|
@@ -5434,8 +5464,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
5434
5464
|
const ggml_tensor * src1 = node->src[1];
|
5435
5465
|
const ggml_tensor * src2 = node->src[2];
|
5436
5466
|
|
5437
|
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
5438
|
-
|
5439
5467
|
switch (node->op) {
|
5440
5468
|
case GGML_OP_UNARY:
|
5441
5469
|
switch (ggml_get_unary_op(node)) {
|
@@ -5547,11 +5575,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
5547
5575
|
|
5548
5576
|
break;
|
5549
5577
|
case GGML_OP_SOFT_MAX:
|
5550
|
-
ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1,
|
5578
|
+
ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
|
5551
5579
|
|
5552
5580
|
break;
|
5553
5581
|
case GGML_OP_ROPE:
|
5554
|
-
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
|
5582
|
+
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
5555
5583
|
|
5556
5584
|
break;
|
5557
5585
|
case GGML_OP_ARGSORT:
|
@@ -5580,7 +5608,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
5580
5608
|
last_node = true;
|
5581
5609
|
#endif
|
5582
5610
|
|
5583
|
-
if (
|
5611
|
+
if (last_node) {
|
5584
5612
|
ggml_vk_ctx_end(ctx->compute_ctx);
|
5585
5613
|
ctx->compute_ctx->exit_tensor = node;
|
5586
5614
|
ctx->compute_ctx = nullptr;
|
@@ -5588,10 +5616,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
5588
5616
|
}
|
5589
5617
|
|
5590
5618
|
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
|
5591
|
-
if (ctx->disable) {
|
5592
|
-
return false;
|
5593
|
-
}
|
5594
|
-
|
5595
5619
|
ggml_tensor_extra_gpu * extra = nullptr;
|
5596
5620
|
|
5597
5621
|
switch (tensor->op) {
|
@@ -5650,7 +5674,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
5650
5674
|
}
|
5651
5675
|
|
5652
5676
|
#ifdef GGML_VULKAN_DEBUG
|
5653
|
-
std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ",
|
5677
|
+
std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
|
5654
5678
|
#endif
|
5655
5679
|
|
5656
5680
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
@@ -5690,9 +5714,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
5690
5714
|
|
5691
5715
|
// Clean up after graph processing is done
|
5692
5716
|
static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
5693
|
-
if (ctx->disable) {
|
5694
|
-
return;
|
5695
|
-
}
|
5696
5717
|
#ifdef GGML_VULKAN_DEBUG
|
5697
5718
|
std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
|
5698
5719
|
#endif
|
@@ -5865,7 +5886,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
|
5865
5886
|
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
5866
5887
|
}
|
5867
5888
|
|
5868
|
-
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
5869
5889
|
tensor->extra = extra;
|
5870
5890
|
}
|
5871
5891
|
|
@@ -5873,8 +5893,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
|
5873
5893
|
#ifdef GGML_VULKAN_DEBUG
|
5874
5894
|
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
5875
5895
|
#endif
|
5876
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
5877
|
-
|
5878
5896
|
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
5879
5897
|
|
5880
5898
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
@@ -5888,8 +5906,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
|
5888
5906
|
#ifdef GGML_VULKAN_DEBUG
|
5889
5907
|
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
5890
5908
|
#endif
|
5891
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
5892
|
-
|
5893
5909
|
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
5894
5910
|
|
5895
5911
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
@@ -6032,6 +6048,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
|
|
6032
6048
|
#ifdef GGML_VULKAN_DEBUG
|
6033
6049
|
std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
6034
6050
|
#endif
|
6051
|
+
size += 32; // Behave like the CPU buffer type
|
6035
6052
|
void * ptr = nullptr;
|
6036
6053
|
try {
|
6037
6054
|
ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
|
@@ -6119,7 +6136,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|
6119
6136
|
#endif
|
6120
6137
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
6121
6138
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
6122
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
6123
6139
|
|
6124
6140
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
6125
6141
|
|
@@ -6140,7 +6156,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|
6140
6156
|
#endif
|
6141
6157
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
6142
6158
|
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
6143
|
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
6144
6159
|
|
6145
6160
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
6146
6161
|
|
@@ -6206,6 +6221,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 ctx->transfer_ctx = nullptr;
 }

+static bool ggml_vk_is_empty(ggml_tensor * node) {
+return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+}
+
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6220,7 +6239,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
 int last_node = cgraph->n_nodes - 1;

 // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-while (last_node > 0 && (cgraph->nodes[last_node]
+while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
 last_node -= 1;
 }

@@ -6234,7 +6253,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
 for (int i = 0; i < cgraph->n_nodes; i++) {
 ggml_tensor * node = cgraph->nodes[i];

-if (
+if (ggml_vk_is_empty(node)) {
 continue;
 }

@@ -6536,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
 for (int j = 0; j < level; j++) {
 std::cerr << " ";
 }
-std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) <<
+std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;

 done.push_back(tensor);

@@ -6548,7 +6567,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
 }

 static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
-if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
+if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
 return;
 }
 i0 = std::max(i0, 5);
@@ -6569,6 +6588,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
 } else if (tensor->type == GGML_TYPE_F16) {
 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+} else if (tensor->type == GGML_TYPE_I32) {
+val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
 } else {
 GGML_ASSERT(false);
 }
@@ -6584,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
 void * tensor_data = tensor->data;

-if (tensor->
+if (ggml_backend_buffer_is_vk(tensor->buffer)) {
 const size_t tensor_size = ggml_nbytes(tensor);
 tensor_data = malloc(tensor_size);

@@ -6595,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
 }

 std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
-std::cerr << "tensor=" << tensor << " tensor->
+std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
 if (tensor->src[0] != nullptr) {
-std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << "
+std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
 }
 if (tensor->src[1] != nullptr) {
-std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << "
+std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
 }
 std::cerr << std::endl << "Result:" << std::endl;
 ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -6611,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
 std::vector<const ggml_tensor *> done;
 ggml_vk_print_graph_origin(tensor, done);

-if (tensor->
+if (ggml_backend_buffer_is_vk(tensor->buffer)) {
 free(tensor_data);
 }
 }

-static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
-return;
-GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
-if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
-return;
-}
-for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-float val = 0.0f;
-if (tensor->type == GGML_TYPE_F32) {
-val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
-} else if (tensor->type == GGML_TYPE_F16) {
-val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
-}
-if (std::isnan(val)) {
-std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
-std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
-std::cerr << std::endl;
-ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
-std::cerr << std::endl;
-std::vector<const ggml_tensor *> done;
-ggml_vk_print_graph_origin(tensor, done);
-GGML_ASSERT(false);
-}
-}
-}
-}
-}
-}
-
 void * comp_result;
 size_t comp_size;
 size_t comp_nb[GGML_MAX_DIMS];
@@ -6701,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

 src0_buffer = malloc(src0_size);
 src0_clone->data = src0_buffer;
-if (src0->
+if (ggml_backend_buffer_is_host(src0->buffer)) {
 memcpy(src0_clone->data, src0->data, src0_size);
 memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src0->
+} else if (ggml_backend_buffer_is_vk(src0->buffer)) {
 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
 vk_buffer buffer_gpu = extra->buffer_gpu.lock();
 uint64_t offset = extra->offset;
@@ -6735,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 ggml_vk_print_tensor(ctx, src0, "src0");
 }
-
-ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
 }
 if (src1 != nullptr) {
 src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6745,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

 src1_buffer = malloc(src1_size);
 src1_clone->data = src1_buffer;
-if (src1->
+if (ggml_backend_buffer_is_host(src1->buffer)) {
 memcpy(src1_clone->data, src1->data, src1_size);
 memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src1->
+} else if (ggml_backend_buffer_is_vk(src1->buffer)) {
 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
 vk_buffer buffer_gpu = extra->buffer_gpu.lock();
 uint64_t offset = extra->offset;
@@ -6779,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 ggml_vk_print_tensor(ctx, src1, "src1");
 std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
-std::cerr << "src1_clone=" << tensor << " src1_clone->
+std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
 if (src1->src[0] != nullptr) {
-std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << "
+std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
 }
 if (src1->src[1] != nullptr) {
-std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << "
+std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
 }
 std::cerr << std::endl << "Result:" << std::endl;
 ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6795,8 +6782,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 std::vector<const ggml_tensor *> done;
 ggml_vk_print_graph_origin(src1_clone, done);
 }
-
-ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
 }
 if (src2 != nullptr) {
 src2_clone = ggml_dup_tensor(ggml_ctx, src2);
@@ -6805,18 +6790,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

 src2_buffer = malloc(src2_size);
 src2_clone->data = src2_buffer;
-if (src2->
+if (ggml_backend_buffer_is_host(src2->buffer)) {
 memcpy(src2_clone->data, src2->data, src2_size);
 memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src2->
+} else if (ggml_backend_buffer_is_vk(src2->buffer)) {
 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
-vk_buffer
+vk_buffer buffer_gpu = extra->buffer_gpu.lock();
 uint64_t offset = extra->offset;
 if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
 for (int i2 = 0; i2 < src2->ne[2]; i2++) {
 const int idx = i3*src2->ne[2] + i2;
-ggml_vk_buffer_read(ctx,
+ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
 }
 }

@@ -6826,10 +6811,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
 }
 } else {
-if (offset + src2_size >=
-src2_size =
+if (offset + src2_size >= buffer_gpu->size) {
+src2_size = buffer_gpu->size - offset;
 }
-ggml_vk_buffer_read(ctx,
+ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
 memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
 }
 } else {
@@ -6839,12 +6824,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 ggml_vk_print_tensor(ctx, src2, "src2");
 std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
-std::cerr << "src2_clone=" << tensor << " src2_clone->
+std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
 if (src2->src[0] != nullptr) {
-std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << "
+std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
 }
 if (src2->src[1] != nullptr) {
-std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << "
+std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
 }
 std::cerr << std::endl << "Result:" << std::endl;
 ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
@@ -6855,8 +6840,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 std::vector<const ggml_tensor *> done;
 ggml_vk_print_graph_origin(src2_clone, done);
 }
-
-ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src2", src2_clone);
 }

 if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6877,7 +6860,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
 } else if (tensor->op == GGML_OP_SOFT_MAX) {
 if (src1 != nullptr) {
-tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone,
+tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
 } else {
 tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
 }
@@ -6894,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 float attn_factor = ((float *) tensor->op_params)[8];
 float beta_fast = ((float *) tensor->op_params)[9];
 float beta_slow = ((float *) tensor->op_params)[10];
-tensor_clone =
+tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
 } else if (tensor->op == GGML_OP_UNARY) {
 switch (ggml_get_unary_op(tensor)) {
 case GGML_UNARY_OP_SILU:
@@ -6937,17 +6920,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 GGML_ASSERT(false);
 }

-// Disable vulkan here to avoid the hooks in ggml.c
-ctx->disable = true;
-
 ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
 ggml_build_forward_expand(cgraph, tensor_clone);

 ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

-ctx->disable = false;
-
-ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
 }
@@ -6964,9 +6941,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 if (src1 != nullptr) {
 free(src1_buffer);
 }
-if (src2 != nullptr) {
-free(src2_buffer);
-}

 ggml_free(ggml_ctx);
 }
@@ -6991,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_

 void * tensor_data = tensor->data;

-if (tensor->
+if (ggml_backend_buffer_is_vk(tensor->buffer)) {
 size_t tensor_size = ggml_nbytes(tensor);
 tensor_data = malloc(tensor_size);

@@ -7026,8 +7000,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 } else if (tensor->type == GGML_TYPE_F16) {
 correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
 result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
+} else if (tensor->type == GGML_TYPE_I32) {
+correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
+result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
 } else {
-std::cerr << "
+std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
 }
 } else {
 std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
@@ -7036,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_

 if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
 std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
-std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
 if (src0 != nullptr) {
-std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
 }
 if (src1 != nullptr) {
-std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
 }
 std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
 std::cerr << std::endl << "Result:" << std::endl;
@@ -7077,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_

 if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
 if (src0 != nullptr) {
-std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
 }
 if (src1 != nullptr) {
-std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
 }
 std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
 std::cerr << std::endl << "Result:" << std::endl;
@@ -7101,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_

 if (avg_err > 0.05 || std::isnan(avg_err)) {
 std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->
+std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
 if (src0 != nullptr) {
-std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << "
+std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
 }
 if (src1 != nullptr) {
-std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << "
+std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
 }
 std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
 std::cerr << std::endl << "Result:" << std::endl;
@@ -7118,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 ggml_vk_print_graph_origin(tensor, done);
 GGML_ASSERT(false);
 } else {
-std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << "
+std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
 }

 free(comp_result);
 comp_result = nullptr;
 comp_size = 0;

-if (tensor->
+if (ggml_backend_buffer_is_vk(tensor->buffer)) {
 free(tensor_data);
 }
 }