llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
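Alongside the vendored llama.cpp refresh, the gem's own files change as well (the native extension llama_cpp.cpp, the RBS signatures, and the version constant in data/lib/llama_cpp/version.rb). A minimal sketch of picking up this release from a Gemfile follows; the version constraint shown is only an illustration, not part of this diff:

    # Gemfile (hypothetical example, not taken from this package)
    gem 'llama_cpp', '~> 0.14.4'

Running `bundle update llama_cpp` afterwards rebuilds the native extension against the updated vendored sources.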
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t
-    vk_buffer
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
@@ -710,6 +709,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
        }
    }

+    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+    if (compute_index >= 0) {
+        return compute_index;
+    }
+
    std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

    for(auto &q_family : queue_family_props) {
@@ -803,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
    vk_buffer buf = std::make_shared<vk_buffer_struct>();

@@ -992,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

    if (device->fp16) {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1049,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
    } else {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1105,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
    }

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1133,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

    // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -1335,7 +1417,33 @@ void ggml_vk_instance_init() {
            vk_instance.device_indices.push_back(tmp);
        }
    } else {
-        vk_instance.
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            GGML_ASSERT(false);
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                vk_instance.device_indices.push_back(i);
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+
+    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
    }

    vk_instance_initialized = true;
@@ -1561,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

    switch (src0_type) {
        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
@@ -2028,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
        ggml_vk_submit(subctx, ctx->fence);
        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
        ctx->device->device.resetFences({ ctx->fence });
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
    }
 }

@@ -2125,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
        for (auto& cpy : subctx->out_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
        }
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
    }
 }

@@ -2292,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
            return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
        case VK_VENDOR_ID_INTEL:
            return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+        default:
+            break;
    }

    if (m <= 32 || n <= 32) {
@@ -2417,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
        src1_uma = d_Qy != nullptr;
    }

-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -2463,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
    uint64_t x_buf_offset = 0;
    vk_buffer d_Y;
    uint64_t y_buf_offset = 0;
-    if (
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
+    if (!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
        qx_buf_offset = extra_src0->offset;
        GGML_ASSERT(d_Qx != nullptr);
    }
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Qy != nullptr);
@@ -2524,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su

    if (x_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (
-
-
-
-        ctx->staging_offset = qx_sz * ne02 * ne03;
-    }
-
-    if (qx_needs_dequant) {
-        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-    }
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
    }
    if (y_non_contig) {
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
    }

    uint32_t stride_batch_x = ne00*ne01;
    uint32_t stride_batch_y = ne10*ne11;

-    if (!ggml_vk_dim01_contiguous(src0) && !
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
    }

-    if (!ggml_vk_dim01_contiguous(src1) && !
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
    }

@@ -2610,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
        src1_uma = d_Qy != nullptr;
    }

-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

@@ -2638,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
    uint64_t x_buf_offset = 0;
    vk_buffer d_Y;
    uint64_t y_buf_offset = 0;
-    if
-        d_Qx = ctx->prealloc_qx;
-    } else if(!src1_uma) {
+    if(!src0_uma) {
        d_Qx = extra_src0->buffer_gpu.lock();
        qx_buf_offset = extra_src0->offset;
        GGML_ASSERT(d_Qx != nullptr);
    }
-    if
-        d_Qy = ctx->prealloc_qy;
-    } else if(!src1_uma) {
+    if(!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Qy != nullptr);
@@ -2694,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
    if (x_non_contig) {
        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (load_x) {
-        // copy data to device
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
    }
    if (y_non_contig) {
        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
    }

    for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2783,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
        src1_uma = d_Qy != nullptr;
    }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
    const uint64_t x_ne = ne00 * ne01 * ne02;
    const uint64_t y_ne = ne10 * ne11 * ne12;
    const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2799,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
    const uint64_t qx_buf_offset = extra_src0->offset;
    GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Qx != nullptr);
@@ -2816,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
    // compute
    const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
    ggml_vk_sync_buffers(subctx);
@@ -2875,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
        src1_uma = d_Qy != nullptr;
    }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
    const uint64_t d_ne = ne01 * ne11 * ne12;

    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2892,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
    const uint64_t qx_buf_offset = extra_src0->offset;
    GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else {
+    if (!src1_uma) {
        d_Qy = extra_src1->buffer_gpu.lock();
        qy_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Qx != nullptr);
@@ -2909,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
    // compute
    const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
    ggml_vk_sync_buffers(subctx);
@@ -3168,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
    }
    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
    GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
    GGML_ASSERT(dst->extra != nullptr);
    const uint64_t ne00 = src0->ne[0];
@@ -3236,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
        }
    }

-
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
    uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
    uint64_t d_sz = ggml_type_size(dst->type) * ne0;
@@ -3255,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
    GGML_ASSERT(d_D != nullptr);
    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
    GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
-    if
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
+    if(!src0_uma) {
        d_X = extra_src0->buffer_gpu.lock();
        x_buf_offset = extra_src0->offset;
        GGML_ASSERT(d_X != nullptr);
    }
-    if (
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
+    if (use_src1 && !src1_uma) {
        d_Y = extra_src1->buffer_gpu.lock();
        y_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Y != nullptr);
    }

-    GGML_ASSERT(!transfer_src2);
    if (use_src2 && !src2_uma) {
        d_Z = extra_src2->buffer_gpu.lock();
        z_buf_offset = extra_src2->offset;
        GGML_ASSERT(d_Z != nullptr);
    }

-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
        x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
        d_sz = ggml_nbytes(dst);

-        if (
+        if (x_buf_offset + x_sz >= d_X->size) {
            x_sz = VK_WHOLE_SIZE;
        }
-        if (
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
            d_sz = VK_WHOLE_SIZE;
        }
    }

    std::array<uint32_t, 3> elements;

-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
    // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

        switch (dst->op) {
@@ -3316,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
        case GGML_OP_ROPE:
            elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
            break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
        default:
            elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
            break;
        }

-        if (op != GGML_OP_CPY) {
+        if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
            if (x_sz != VK_WHOLE_SIZE) {
                x_sz *= ne02 * ne03;
            }
-            if (y_sz != VK_WHOLE_SIZE) {
+            if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                y_sz *= ne12 * ne13;
            }
            if (d_sz != VK_WHOLE_SIZE) {
@@ -3380,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
            case GGML_OP_ROPE:
                elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
                break;
+            case GGML_OP_GET_ROWS:
+                elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+                break;
            default:
                elements = { (uint32_t)ne0, 1, 1 };
                break;
@@ -3414,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }

 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3570,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
    if (is_neox) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
    }
 }

@@ -3581,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
 }

-static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-        vk_buffer d_D = extra_src0->buffer_gpu.lock();
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
-    }
-}
-
 #ifdef GGML_VULKAN_RUN_TESTS
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3613,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
                val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
            } else if (type == GGML_TYPE_F16) {
                val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+            } else {
+                GGML_ASSERT(false);
            }
            fprintf(stderr, "% 7.2f ", val);
        } else {
@@ -3914,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
                val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
            } else if (tensor->type == GGML_TYPE_F16) {
                val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
            }
            fprintf(stderr, "% 7.2f ", val);
        } else {
@@ -4329,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

-    if (avg_err > 0.
+    if (avg_err > 0.01 || std::isnan(avg_err)) {
        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
        std::cerr << "Actual result: " << std::endl << std::endl;
        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4379,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
    return extra;
 }

-static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
-    return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
-}
-
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
        return;
    }

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-    if (extra == nullptr) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra = ggml_vk_tensor_create_extra(node);
-    }

    ggml_tensor * src0 = node->src[0];
    ggml_tensor * src1 = node->src[1];
@@ -4419,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
    const int64_t ne22 = node->ne[2];
    const int64_t ne23 = node->ne[3];

-    const
+    const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+    const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+    const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

    int split_k;
    if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4431,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
        const uint32_t y_ne = ne10 * ne11;
        const uint32_t d_ne = ne20 * ne21;

-        const uint64_t
-        const uint64_t
-        const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-        const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+        const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+        const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
        uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
        const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

@@ -4477,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
             break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-            if (ctx->prealloc_size_qx < qx_sz) {
-                ctx->prealloc_size_qx = qx_sz;
-            }
-            if (ctx->prealloc_size_qy < qy_sz) {
-                ctx->prealloc_size_qy = qy_sz;
-            }
             if (ctx->prealloc_size_x < x_sz) {
                 ctx->prealloc_size_x = x_sz;
             }
@@ -4506,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         return;
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(
+    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4569,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);

+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
     std::cerr << std::endl;

     const std::vector<size_t> vals {
@@ -4608,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     GGML_ASSERT(false);
 #endif

-    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
-        // Resize buffer
-        if (ctx->prealloc_qx != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qx);
-        }
-        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
-    }
-    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
-        // Resize buffer
-        if (ctx->prealloc_qy != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qy);
-        }
-        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
-    }
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
@@ -4655,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }

 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }

@@ -4687,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         }
         break;
     case GGML_OP_REPEAT:
-
+    case GGML_OP_GET_ROWS:
     case GGML_OP_ADD:
     case GGML_OP_MUL:
     case GGML_OP_SCALE:
@@ -4711,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ARGSORT:
         break;
     default:
-
-
-        GGML_ASSERT(false);
-        }
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+        GGML_ASSERT(false);
         return;
     }

@@ -4763,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4831,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }

 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
+    if (ctx->disable) {
         return false;
     }

@@ -4878,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-            return false;
-        }
-
         extra = (ggml_tensor_extra_gpu *) tensor->extra;

         break;
@@ -4995,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 #endif
     ggml_vk_graph_cleanup(ctx);

-    ggml_vk_destroy_buffer(ctx->prealloc_qx);
-    ggml_vk_destroy_buffer(ctx->prealloc_qy);
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5007,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_destroy_buffer(buffer);
     }

-    ctx->prealloc_size_qx = 0;
-    ctx->prealloc_size_qy = 0;
     ctx->prealloc_size_x = 0;
     ctx->prealloc_size_y = 0;
     ctx->prealloc_size_split_k = 0;
@@ -5039,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
     snprintf(description, description_size, "%s", props.deviceName.data());
 }

-// CPU assist interface
-
-void ggml_vk_init_cpu_assist() {
-    ggml_vk_instance_init();
-
-    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
-    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
-        ggml_vk_print_gpu_info(i);
-    }
-    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
-    ggml_backend_vk_init(0);
-}
-
-void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers_graph(ctx, node);
-}
-
-void ggml_vk_preallocate_buffers_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers(ctx);
-}
-
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_build_graph(ctx, node, last_node);
-}
-
-bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return false;
-    }
-
-    return ggml_vk_compute_forward(ctx, params, tensor);
-}
-
-void ggml_vk_graph_cleanup_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-}
-
-void ggml_vk_free_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
-        return;
-    }
-
-    ggml_backend_vk_free(vk_instance.backends[0]);
-}
-
 // backend interface

 #define UNUSED GGML_UNUSED
@@ -5324,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .is_host = */ NULL,
 };

-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" <<
+    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif

-    GGML_ASSERT(
+    GGML_ASSERT(dev_num < vk_instance.device_indices.size());

-    ggml_backend_vk_init(
+    ggml_backend_vk_init(dev_num);

-    return &vk_instance.buffer_types[
+    return &vk_instance.buffer_types[dev_num];
 }

 // host buffer type
@@ -5502,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

-        ggml_vk_buffer_copy_async(ctx->transfer_ctx,
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
         return true;
     }

@@ -5536,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }

 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5560,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }

@@ -5596,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             }
             break;
         case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
+        // case GGML_OP_MUL_MAT_ID:
             {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                        break;
+                    default:
+                        return false;
+                }
                 struct ggml_tensor * a;
                 struct ggml_tensor * b;
                 if (op->op == GGML_OP_MUL_MAT) {
@@ -5612,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return true;
             } break;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
         case GGML_OP_CPY:
+        case GGML_OP_DUP:
             {
                 ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
+                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
@@ -5642,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return false;
             } break;
-        case GGML_OP_DUP:
         // case GGML_OP_REPEAT:
         //     {
         //         ggml_type src0_type = op->src[0]->type;
@@ -5679,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
     UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const ggml_tensor * dst = op;
+
+    const int min_batch_size = 32;
+
+    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+        return true;
+    }
+
+    return false;
+
+    UNUSED(backend);
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -5693,6 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -5705,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
     return &guid;
 }

-GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t
-    if (vk_instance.initialized[
-        return vk_instance.backends[
+GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    if (vk_instance.initialized[dev_num]) {
+        return vk_instance.backends[dev_num];
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" <<
+    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
 #endif

-    ggml_backend_vk_context * ctx = &vk_instance.contexts[
-    ggml_vk_init(ctx,
-    ctx->name = GGML_VK_NAME + std::to_string(
-    vk_instance.buffer_types[
+    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+    ggml_vk_init(ctx, dev_num);
+    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+    vk_instance.buffer_types[dev_num] = {
         /* .iface = */ ggml_backend_vk_buffer_type_interface,
         /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
     };
-    vk_instance.initialized[
+    vk_instance.initialized[dev_num] = true;

     ggml_backend_t vk_backend = new ggml_backend {
         /* .guid = */ ggml_backend_vk_guid(),
@@ -5728,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };

-    vk_instance.backends[
+    vk_instance.backends[dev_num] = vk_backend;

     return vk_backend;
 }
@@ -5772,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
 extern "C" GGML_CALL int ggml_backend_vk_reg_devices();

 GGML_CALL int ggml_backend_vk_reg_devices() {
-
+    ggml_vk_instance_init();
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME,
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(
+        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
     }
     return vk_instance.device_indices.size();
 }
@@ -5859,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
                 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -5953,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }

+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
     ggml_tensor * src2 = tensor->src[2];
@@ -6212,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
     } else if (tensor->op == GGML_OP_TRANSPOSE) {
         tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_GET_ROWS) {
+        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ASSERT(false);
@@ -6262,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }

+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];

@@ -6405,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         free(tensor_data);
     }
 }
-
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    ggml_vk_check_results_0(ctx, params, tensor);
-}
 #endif