llama_cpp 0.14.2 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
--- a/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
+++ b/data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
-    vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
@@ -710,6 +709,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
         }
     }

+    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+    if (compute_index >= 0) {
+        return compute_index;
+    }
+
     std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

     for(auto &q_family : queue_family_props) {
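The comment added in this hunk cites the Vulkan rule that any queue family advertising VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT implicitly supports transfer, which is why falling back to `compute_index` is valid even when no family sets VK_QUEUE_TRANSFER_BIT explicitly. A minimal standalone sketch of that selection logic, using local stand-ins for the VK_QUEUE_* bits rather than the real Vulkan headers:

```cpp
// Sketch only (not the library's helper): prefer an explicit transfer family,
// then fall back to a graphics/compute family, which implies transfer support.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t QUEUE_GRAPHICS = 1u << 0; // mirrors VK_QUEUE_GRAPHICS_BIT
constexpr uint32_t QUEUE_COMPUTE  = 1u << 1; // mirrors VK_QUEUE_COMPUTE_BIT
constexpr uint32_t QUEUE_TRANSFER = 1u << 2; // mirrors VK_QUEUE_TRANSFER_BIT

static int find_transfer_family(const std::vector<uint32_t> & family_flags) {
    int compute_index = -1;
    for (size_t i = 0; i < family_flags.size(); i++) {
        if (family_flags[i] & QUEUE_TRANSFER) {
            return (int) i;          // explicit transfer support wins
        }
        if (compute_index < 0 && (family_flags[i] & (QUEUE_GRAPHICS | QUEUE_COMPUTE))) {
            compute_index = (int) i; // implied transfer support, kept as fallback
        }
    }
    return compute_index;            // -1 only if no family qualifies at all
}

int main() {
    // A device that reports one graphics+compute family without the transfer bit.
    std::vector<uint32_t> flags = { QUEUE_GRAPHICS | QUEUE_COMPUTE };
    std::printf("transfer-capable family: %d\n", find_transfer_family(flags));
}
```

Preferring an explicit transfer family still keeps copies off the compute queue when the hardware exposes a dedicated DMA queue; the new fallback just stops the lookup from failing on drivers that omit the redundant bit.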
@@ -803,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();

@@ -992,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

     if (device->fp16) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1049,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1105,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     }

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1133,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -1335,7 +1417,33 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(tmp);
         }
     } else {
-        vk_instance.device_indices.push_back(0);
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            GGML_ASSERT(false);
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                vk_instance.device_indices.push_back(i);
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+
+    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
     }

     vk_instance_initialized = true;
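The old default simply used device 0; the new path enumerates all physical devices and prefers every discrete GPU, falling back to device 0 only when none exists. A sketch of that selection policy under the same assumption, with a stand-in enum instead of `vk::PhysicalDeviceType`:

```cpp
// Sketch of the new device-selection default (simplified, not the library's
// exact code): take all discrete GPUs; if there are none, fall back to device 0.
#include <cstdio>
#include <vector>

enum class DeviceType { IntegratedGpu, DiscreteGpu, Cpu }; // stand-in for vk::PhysicalDeviceType

static std::vector<size_t> pick_devices(const std::vector<DeviceType> & devices) {
    std::vector<size_t> indices;
    for (size_t i = 0; i < devices.size(); i++) {
        if (devices[i] == DeviceType::DiscreteGpu) {
            indices.push_back(i);          // default to every dedicated GPU
        }
    }
    if (indices.empty() && !devices.empty()) {
        indices.push_back(0);              // no discrete GPU: fall back to device 0
    }
    return indices;
}

int main() {
    std::vector<DeviceType> devs = { DeviceType::IntegratedGpu, DeviceType::DiscreteGpu };
    for (size_t idx : pick_devices(devs)) {
        std::printf("using device %zu\n", idx);   // prints: using device 1
    }
}
```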
@@ -1561,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

     switch (src0_type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
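This switch is the capability gate for the new K-quant matmul shaders: any src0 type not listed makes `ggml_vk_get_mul_mat_mat_pipeline` return `nullptr`, and the caller then takes the dequantize-first path instead of dispatching a shader that does not exist. A small sketch of the same lookup-or-fallback pattern (types and names here are mine, not the library's):

```cpp
// Sketch of a "return nullptr for unsupported types" gate; the caller treats a
// null pipeline as "dequantize first, then run the generic matmul".
#include <cstdio>

enum class QType { F16, Q4_0, Q6_K, IQ2_XXS };
struct Pipeline { const char * name; };

static const Pipeline * get_mul_mat_mat_pipeline(QType t) {
    static const Pipeline q4_0 = { "matmul_q4_0_f32" };
    static const Pipeline q6_k = { "matmul_q6_k_f32" };
    switch (t) {
        case QType::Q4_0: return &q4_0;
        case QType::Q6_K: return &q6_k;   // newly covered in this release
        default:          return nullptr; // unsupported: caller falls back
    }
}

int main() {
    const QType types[] = { QType::Q6_K, QType::IQ2_XXS };
    for (QType t : types) {
        const Pipeline * p = get_mul_mat_mat_pipeline(t);
        std::printf("%s\n", p ? p->name : "fallback: dequant + matmul");
    }
}
```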
@@ -2028,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         ggml_vk_submit(subctx, ctx->fence);
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }

@@ -2125,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
         for (auto& cpy : subctx->out_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
        }
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
    }
 }

@@ -2292,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
     case VK_VENDOR_ID_INTEL:
         return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+    default:
+        break;
     }

     if (m <= 32 || n <= 32) {
@@ -2417,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

     const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -2463,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
+    if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2524,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su

     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (load_x) {
-
-
-
-        ctx->staging_offset = qx_sz * ne02 * ne03;
-    }
-
-    if (qx_needs_dequant) {
-        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-    }
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }

     uint32_t stride_batch_x = ne00*ne01;
     uint32_t stride_batch_y = ne10*ne11;

-    if (!ggml_vk_dim01_contiguous(src0) && !
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
         stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
     }

-    if (!ggml_vk_dim01_contiguous(src1) && !
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }

@@ -2610,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

@@ -2638,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if(!src1_uma) {
+    if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if(!src1_uma) {
+    if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2694,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (load_x) {
-        // copy data to device
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }

     for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2783,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2799,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2816,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -2875,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t d_ne = ne01 * ne11 * ne12;

     const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2892,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2909,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -3168,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
     const uint64_t ne00 = src0->ne[0];
@@ -3236,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }

-    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;
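The `x_sz` change matters because `ggml_type_size` for block-quantized types is bytes per *block* of `ggml_blck_size` elements, not bytes per element, so the old formula overestimated quantized row sizes. A small sketch with the Q4_0 constants (32-element blocks of 18 bytes, which I believe are correct for ggml); note it multiplies before dividing, mirroring ggml's usual row-size idiom, to avoid integer truncation:

```cpp
// Bytes needed for one row of ne0 elements, block-quantization aware.
#include <cstdint>
#include <cstdio>

struct TypeTraits { uint64_t type_size; uint64_t blck_size; }; // bytes per block, elems per block

static uint64_t row_bytes(TypeTraits t, uint64_t ne0) {
    // multiply first: 18 * 4096 / 32 == 2304, whereas (18 / 32) * 4096 would be 0
    return t.type_size * ne0 / t.blck_size;
}

int main() {
    TypeTraits q4_0 = { 18, 32 }; // GGML_TYPE_Q4_0: 2-byte scale + 16 bytes of nibbles
    TypeTraits f32  = {  4,  1 }; // GGML_TYPE_F32: one element per "block"
    std::printf("Q4_0 row of 4096 elems: %llu bytes\n", (unsigned long long) row_bytes(q4_0, 4096));
    std::printf("F32  row of 4096 elems: %llu bytes\n", (unsigned long long) row_bytes(f32,  4096));
}
```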
@@ -3255,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     GGML_ASSERT(d_D != nullptr);
     uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
-    if (transfer_src0) {
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
+    if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
         x_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_X != nullptr);
     }
-    if (transfer_src1) {
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
+    if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }

-    GGML_ASSERT(!transfer_src2);
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
         GGML_ASSERT(d_Z != nullptr);
     }

-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
         x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
         d_sz = ggml_nbytes(dst);

-        if (
+        if (x_buf_offset + x_sz >= d_X->size) {
             x_sz = VK_WHOLE_SIZE;
         }
-        if (
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
     }

     std::array<uint32_t, 3> elements;

-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

         switch (dst->op) {
@@ -3316,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
             break;
         }

-        if (op != GGML_OP_CPY) {
+        if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
             if (x_sz != VK_WHOLE_SIZE) {
                 x_sz *= ne02 * ne03;
             }
-            if (y_sz != VK_WHOLE_SIZE) {
+            if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                 y_sz *= ne12 * ne13;
             }
             if (d_sz != VK_WHOLE_SIZE) {
@@ -3380,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ne0, 1, 1 };
             break;
@@ -3414,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }

 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
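`ggml_vk_get_rows` now routes through the generic binary-op path, packing extents and strides for all three tensors into one push-constant block; dividing each `nb[i]` byte stride by the type size converts it into a stride the shader can index with, independent of element width. A hypothetical layout matching the argument order of the call above (field names are my own; the real `vk_op_binary_push_constants` is defined elsewhere in ggml-vulkan.cpp):

```cpp
// Hypothetical mirror of the push-constant block filled by ggml_vk_get_rows.
#include <cstdint>

struct op_binary_push_constants {
    uint32_t ne;                     // total element count of src0
    uint32_t ne00, ne01, ne02, ne03; // src0 extents
    uint32_t nb00, nb01, nb02, nb03; // src0 strides, in elements (nb[i] / type_size)
    uint32_t ne10, ne11, ne12, ne13; // src1 extents
    uint32_t nb10, nb11, nb12, nb13; // src1 strides, in elements
    uint32_t ne20, ne21, ne22, ne23; // dst extents
    uint32_t nb20, nb21, nb22, nb23; // dst strides, in elements
    uint32_t misc;                   // the trailing 0 in the call above
    float    param1, param2;         // the trailing 0.0f, 0.0f
};

static_assert(sizeof(op_binary_push_constants) == 28 * 4, "28 packed 32-bit fields");
```

Reusing one parameter layout across add, mul, cpy, and now get_rows is what lets all of them share `ggml_vk_op_f32` instead of each op carrying its own dispatch plumbing.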
@@ -3570,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
     }
 }

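The only change to the two RoPE dispatches is the brace set around `corr_dims[0], corr_dims[1], 0.0f, 0.0f`, which suggests the push-constant structs hold the correction dims as a four-float array sub-object. A tiny sketch of why the inner braces matter for aggregate initialization (hypothetical struct, not the library's):

```cpp
// With an array member, the inner braces initialize exactly that sub-object;
// a flat initializer list relies on brace elision and is easy to misalign
// when fields are added or reordered.
#include <cstdio>

struct rope_push_constants {
    unsigned ncols;
    float    freq_scale;
    float    corr_dims[4]; // assumed array member; the inner {...} fills it
};

int main() {
    rope_push_constants pc = { 128, 1.0f, {0.25f, 0.75f, 0.0f, 0.0f} };
    std::printf("ncols=%u corr_dims[0]=%g corr_dims[1]=%g\n",
                pc.ncols, pc.corr_dims[0], pc.corr_dims[1]);
}
```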
@@ -3581,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
     ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
 }

-static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-        vk_buffer d_D = extra_src0->buffer_gpu.lock();
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
-    }
-}
-
 #ifdef GGML_VULKAN_RUN_TESTS
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3613,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
             val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
         } else if (type == GGML_TYPE_F16) {
             val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+        } else {
+            GGML_ASSERT(false);
         }
         fprintf(stderr, "% 7.2f ", val);
     } else {
@@ -3914,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
             val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
         } else if (tensor->type == GGML_TYPE_F16) {
             val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+        } else {
+            GGML_ASSERT(false);
         }
         fprintf(stderr, "% 7.2f ", val);
     } else {
@@ -4329,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

     std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

-    if (avg_err > 0.
+    if (avg_err > 0.01 || std::isnan(avg_err)) {
         std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
         std::cerr << "Actual result: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
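The new threshold also guards against NaN explicitly, which matters because every ordered comparison involving NaN evaluates to false, so `avg_err > 0.01` alone would let a NaN result slip past the test:

```cpp
// Demonstrates why the std::isnan() guard is required in the error check.
#include <cmath>
#include <cstdio>

int main() {
    double avg_err = std::nan("");
    std::printf("avg_err > 0.01     -> %s\n", avg_err > 0.01 ? "true" : "false"); // false!
    std::printf("with isnan() guard -> %s\n",
                (avg_err > 0.01 || std::isnan(avg_err)) ? "true" : "false");      // true
}
```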
@@ -4379,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
     return extra;
 }

-static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
-    return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
-}
-
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-    if (extra == nullptr) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra = ggml_vk_tensor_create_extra(node);
-    }

     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
@@ -4419,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const int64_t ne22 = node->ne[2];
     const int64_t ne23 = node->ne[3];

-    const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
+    const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+    const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+    const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

     int split_k;
     if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4431,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
         const uint32_t y_ne = ne10 * ne11;
         const uint32_t d_ne = ne20 * ne21;
 
-        const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-        const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-        const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-        const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+        const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+        const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
         uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
         const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
 
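
Both scratch sizes are rounded up to the device's minStorageBufferOffsetAlignment per 2D slice before being multiplied by the batch dimensions. `ggml_vk_align_size` is presumably the usual round-up-to-multiple idiom; a sketch:

    #include <cstddef>

    // Round a byte count up to the next multiple of `alignment` (a sketch of
    // what ggml_vk_align_size presumably does; alignment must be non-zero).
    static size_t align_size(size_t size, size_t alignment) {
        return ((size + alignment - 1) / alignment) * alignment;
    }
    // e.g. align_size(1000, 256) == 1024, so each of the ne02 * ne03 slices
    // starts at a legal storage-buffer offset.
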
@@ -4477,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (ctx->prealloc_size_qx < qx_sz) {
-            ctx->prealloc_size_qx = qx_sz;
-        }
-        if (ctx->prealloc_size_qy < qy_sz) {
-            ctx->prealloc_size_qy = qy_sz;
-        }
         if (ctx->prealloc_size_x < x_sz) {
             ctx->prealloc_size_x = x_sz;
         }
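
Note that these case blocks only ever raise the requirements: the pass keeps a high-water mark per scratch buffer across all graph nodes, and the actual allocation happens once afterwards in ggml_vk_preallocate_buffers. The idiom, as a sketch:

    // High-water-mark sizing (sketch): every node raises the requirement,
    // nothing lowers it until ggml_vk_cleanup() resets the sizes to zero.
    static void raise_to(size_t & current, size_t required) {
        if (current < required) {
            current = required;
        }
    }
    // per node:
    //   raise_to(ctx->prealloc_size_x, x_sz);
    //   raise_to(ctx->prealloc_size_y, y_sz);
    //   raise_to(ctx->prealloc_size_split_k, split_k_size);
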
@@ -4506,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         return;
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
+    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4569,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
 
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
     std::cerr << std::endl;
 
     const std::vector<size_t> vals {
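
The new calls sweep the five k-quant types through the same grid already used for Q8_0. Assuming the trailing arguments mean (split_k, matmul shader variant, quantization type), as the surrounding Q8_0 calls suggest, the sweep could equivalently be written as a loop; a sketch:

    for (ggml_type t : { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K,
                         GGML_TYPE_Q5_K, GGML_TYPE_Q6_K }) {
        for (int split_k : { 1, 4 }) {
            for (int variant = 0; variant < 3; variant++) {
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, split_k, variant, t);
            }
        }
    }
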
@@ -4608,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     GGML_ASSERT(false);
 #endif
 
-    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
-        // Resize buffer
-        if (ctx->prealloc_qx != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qx);
-        }
-        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
-    }
-    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
-        // Resize buffer
-        if (ctx->prealloc_qy != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qy);
-        }
-        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
-    }
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
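
Each remaining prealloc_* buffer follows the same grow-or-keep pattern: reallocate only when the recorded requirement exceeds the current size, and destroy rather than copy, since the contents are per-graph scratch. Factored out, the pattern looks like this (a sketch using the helpers visible in this file):

    static void ensure_capacity(ggml_backend_vk_context * ctx, vk_buffer & buf, size_t required) {
        if (buf == nullptr || (required > 0 && buf->size < required)) {
            if (buf != nullptr) {
                ggml_vk_destroy_buffer(buf);   // scratch contents need not survive
            }
            buf = ggml_vk_create_buffer_device(ctx, required);
        }
    }
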
@@ -4655,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
 
@@ -4687,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
         }
         break;
     case GGML_OP_REPEAT:
-    // case GGML_OP_GET_ROWS:
+    case GGML_OP_GET_ROWS:
     case GGML_OP_ADD:
     case GGML_OP_MUL:
    case GGML_OP_SCALE:
@@ -4711,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
     case GGML_OP_ARGSORT:
         break;
     default:
-        if (any_on_device) {
-            std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
-            GGML_ASSERT(false);
-        }
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+        GGML_ASSERT(false);
         return;
     }
 
@@ -4763,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4831,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
+    if (ctx->disable) {
         return false;
     }
 
@@ -4878,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-            return false;
-        }
-
         extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         break;
@@ -4995,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 #endif
     ggml_vk_graph_cleanup(ctx);
 
-    ggml_vk_destroy_buffer(ctx->prealloc_qx);
-    ggml_vk_destroy_buffer(ctx->prealloc_qy);
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5007,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_destroy_buffer(buffer);
     }
 
-    ctx->prealloc_size_qx = 0;
-    ctx->prealloc_size_qy = 0;
     ctx->prealloc_size_x = 0;
     ctx->prealloc_size_y = 0;
     ctx->prealloc_size_split_k = 0;
@@ -5039,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     snprintf(description, description_size, "%s", props.deviceName.data());
 }
 
-// CPU assist interface
-
-void ggml_vk_init_cpu_assist() {
-    ggml_vk_instance_init();
-
-    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
-    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
-        ggml_vk_print_gpu_info(i);
-    }
-    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
-    ggml_backend_vk_init(0);
-}
-
-void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers_graph(ctx, node);
-}
-
-void ggml_vk_preallocate_buffers_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers(ctx);
-}
-
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_build_graph(ctx, node, last_node);
-}
-
-bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return false;
-    }
-
-    return ggml_vk_compute_forward(ctx, params, tensor);
-}
-
-void ggml_vk_graph_cleanup_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-}
-
-void ggml_vk_free_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
-        return;
-    }
-
-    ggml_backend_vk_free(vk_instance.backends[0]);
-}
-
 // backend interface
 
 #define UNUSED GGML_UNUSED
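
With the whole cpu-assist surface deleted, this file is now driven exclusively through the standard ggml-backend interface. A hedged usage sketch built only from entry points visible in this diff, plus the generic graph-compute call from ggml-backend.h:

    ggml_backend_t backend = ggml_backend_vk_init(0);                  // device 0
    ggml_backend_buffer_type_t buft = ggml_backend_vk_buffer_type(0);  // its buffer type
    // ... allocate tensors from buft, build a ggml_cgraph * graph, then:
    // ggml_backend_graph_compute(backend, graph);
    ggml_backend_vk_free(backend);
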
@@ -5324,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .is_host          = */ NULL,
 };
 
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl;
+    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
 
-    GGML_ASSERT(idx < vk_instance.device_indices.size());
+    GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
-    ggml_backend_vk_init(idx);
+    ggml_backend_vk_init(dev_num);
 
-    return &vk_instance.buffer_types[idx];
+    return &vk_instance.buffer_types[dev_num];
 }
 
 // host buffer type
@@ -5502,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src));
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
     return true;
 }
 
@@ -5536,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5560,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -5596,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
             }
             break;
         case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
+        // case GGML_OP_MUL_MAT_ID:
            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                        break;
+                    default:
+                        return false;
+                }
                struct ggml_tensor * a;
                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
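
The explicit src0 type whitelist (note that GGML_OP_MUL_MAT_ID is now commented out, i.e. no longer claimed) is what the graph scheduler consults before placing a node. A sketch of the scheduler-side view:

    // Nodes the Vulkan backend rejects fall back to another backend,
    // typically the CPU (ggml_backend_supports_op is the generic query).
    if (!ggml_backend_supports_op(vk_backend, node)) {
        // place `node` on the CPU backend instead
    }
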
@@ -5612,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
             }
             return true;
         } break;
-        // case GGML_OP_GET_ROWS:
-        //     {
-        //         switch (op->src[0]->type) {
-        //             case GGML_TYPE_F32:
-        //             case GGML_TYPE_F16:
-        //             case GGML_TYPE_Q4_0:
-        //             case GGML_TYPE_Q4_1:
-        //             case GGML_TYPE_Q5_0:
-        //             case GGML_TYPE_Q5_1:
-        //             case GGML_TYPE_Q8_0:
-        //                 return true;
-        //             default:
-        //                 return false;
-        //         }
-        //     } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
         case GGML_OP_CPY:
+        case GGML_OP_DUP:
             {
                 ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
+                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
@@ -5642,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
             }
             return false;
         } break;
-        case GGML_OP_DUP:
         // case GGML_OP_REPEAT:
        //     {
        //         ggml_type src0_type = op->src[0]->type;
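
Folding GGML_OP_DUP into the GGML_OP_CPY case works because DUP carries no src[1]: the new fallback treats the destination type as src0's own type, so a dup is checked as a same-type copy. A sketch of the combined check, using only the F32 to F32 acceptance visible in this hunk:

    static bool supports_cpy_like(const ggml_tensor * op) {
        const ggml_type src0_type = op->src[0]->type;
        // GGML_OP_DUP has no src[1]; a dup is a same-type copy.
        const ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
        return src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32;  // further pairs elided
    }
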
@@ -5679,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const ggml_tensor * dst = op;
+
+    const int min_batch_size = 32;
+
+    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+        return true;
+    }
+
+    return false;
+
+    UNUSED(backend);
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
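
The new offload_op hook tells the scheduler when it pays to ship data to the GPU for a node whose inputs live elsewhere: only results with more than min_batch_size (32) rows, and never GET_ROWS, whose gathers are cheaper where the data already lives. A worked illustration, assuming ne[1] is the result's row/batch dimension:

    static bool worth_offloading(int64_t rows, bool is_get_rows, int min_batch_size = 32) {
        return rows > min_batch_size && !is_get_rows;
    }
    // worth_offloading(1,   false) == false  (matvec: transfer outweighs compute)
    // worth_offloading(512, false) == true   (prompt batch of 512 rows)
    // worth_offloading(512, true)  == false  (GET_ROWS is excluded explicitly)
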
@@ -5693,6 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .supports_op             = */ ggml_backend_vk_supports_op,
+    /* .offload_op              = */ ggml_backend_vk_offload_op,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -5705,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
-    if (vk_instance.initialized[idx]) {
-        return vk_instance.backends[idx];
+GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    if (vk_instance.initialized[dev_num]) {
+        return vk_instance.backends[dev_num];
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl;
+    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
 #endif
 
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[idx];
-    ggml_vk_init(ctx, idx);
-    ctx->name = GGML_VK_NAME + std::to_string(idx);
-    vk_instance.buffer_types[idx] = {
+    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+    ggml_vk_init(ctx, dev_num);
+    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+    vk_instance.buffer_types[dev_num] = {
         /* .iface    = */ ggml_backend_vk_buffer_type_interface,
         /* .context  = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
     };
-    vk_instance.initialized[idx] = true;
+    vk_instance.initialized[dev_num] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_vk_guid(),
@@ -5728,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
         /* .context   = */ &vk_instance.contexts[ctx->idx],
     };
 
-    vk_instance.backends[idx] = vk_backend;
+    vk_instance.backends[dev_num] = vk_backend;
 
     return vk_backend;
 }
@@ -5772,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
 extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
 
 GGML_CALL int ggml_backend_vk_reg_devices() {
-    for (auto idx : vk_instance.device_indices) {
+    ggml_vk_instance_init();
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx);  // NOLINT
+        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i);  // NOLINT
     }
     return vk_instance.device_indices.size();
 }
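
ggml_backend_vk_reg_devices now initializes the Vulkan instance itself before enumerating, so registration no longer depends on the removed cpu-assist init path. Each device lands in the global backend registry under GGML_VK_NAME plus its index. A lookup sketch, assuming the registry API of this llama.cpp vintage and that GGML_VK_NAME expands to "Vulkan":

    size_t idx = ggml_backend_reg_find_by_name("Vulkan0");   // hypothetical registered name
    ggml_backend_t backend = ggml_backend_reg_init_backend(idx, /* params = */ nullptr);
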
@@ -5859,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
                 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -5953,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
     ggml_tensor * src2 = tensor->src[2];
@@ -6212,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
         tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
     } else if (tensor->op == GGML_OP_TRANSPOSE) {
         tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_GET_ROWS) {
+        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ASSERT(false);
@@ -6262,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
 
@@ -6405,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
         free(tensor_data);
     }
 }
-
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    ggml_vk_check_results_0(ctx, params, tensor);
-}
 #endif