llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t
-    vk_buffer
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
@@ -809,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -998,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1055,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1111,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     }
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1139,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
 
@@ -1341,7 +1417,33 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(tmp);
         }
     } else {
-        vk_instance.
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            GGML_ASSERT(false);
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                vk_instance.device_indices.push_back(i);
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+
+    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
     }
 
     vk_instance_initialized = true;
@@ -1567,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 
     switch (src0_type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             break;
         default:
             return nullptr;
@@ -2034,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         ggml_vk_submit(subctx, ctx->fence);
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
@@ -2131,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
@@ -2298,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
             return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
         case VK_VENDOR_ID_INTEL:
             return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+        default:
+            break;
     }
 
     if (m <= 32 || n <= 32) {
@@ -2423,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
 
     const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
 
@@ -2469,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
+    if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2530,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (
-
-
-
-        ctx->staging_offset = qx_sz * ne02 * ne03;
-    }
-
-    if (qx_needs_dequant) {
-        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-    }
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }
 
     uint32_t stride_batch_x = ne00*ne01;
     uint32_t stride_batch_y = ne10*ne11;
 
-    if (!ggml_vk_dim01_contiguous(src0) && !
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
         stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
     }
 
-    if (!ggml_vk_dim01_contiguous(src1) && !
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
@@ -2616,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
 
     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
 
@@ -2644,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if
-        d_Qx = ctx->prealloc_qx;
-    } else if(!src1_uma) {
+    if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if
-        d_Qy = ctx->prealloc_qy;
-    } else if(!src1_uma) {
+    if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2700,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (load_x) {
-        // copy data to device
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }
 
     for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2789,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2805,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2822,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
 
-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -2881,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t d_ne = ne01 * ne11 * ne12;
 
     const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2898,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2915,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
 
-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -3174,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
     const uint64_t ne00 = src0->ne[0];
@@ -3242,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }
 
-
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;
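The changed `x_sz` formula matters because GET_ROWS can now hand this path a quantized src0. For quantized tensors, `ggml_type_size()` is the byte size of one block and `ggml_blck_size()` the number of elements per block, so the per-element byte count is their quotient. A small sketch of that sizing rule; `bytes_for_elements` is an illustrative name, and (as far as I recall) `ggml_row_size()` in ggml.h computes the same quantity:

```cpp
#include "ggml.h"

// Bytes needed for ne0 elements of a (possibly quantized) ggml type.
// For GGML_TYPE_F32 this is 4 * ne0; for GGML_TYPE_Q4_0 (32-element blocks of 18 bytes)
// it is 18 * ne0 / 32, not the 18 * ne0 the old expression would have produced.
static size_t bytes_for_elements(enum ggml_type type, int64_t ne0) {
    return ggml_type_size(type) * ne0 / ggml_blck_size(type);
}
```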
@@ -3261,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     GGML_ASSERT(d_D != nullptr);
     uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
-    if
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
+    if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
         x_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_X != nullptr);
     }
-    if (
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
+    if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }
 
-    GGML_ASSERT(!transfer_src2);
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
         GGML_ASSERT(d_Z != nullptr);
     }
 
-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
         x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
         d_sz = ggml_nbytes(dst);
 
-        if (
+        if (x_buf_offset + x_sz >= d_X->size) {
             x_sz = VK_WHOLE_SIZE;
         }
-        if (
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
     }
 
     std::array<uint32_t, 3> elements;
 
-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
 
         switch (dst->op) {
@@ -3322,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
            elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
            break;
        }
 
-        if (op != GGML_OP_CPY) {
+        if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
            if (x_sz != VK_WHOLE_SIZE) {
                x_sz *= ne02 * ne03;
            }
-            if (y_sz != VK_WHOLE_SIZE) {
+            if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                y_sz *= ne12 * ne13;
            }
            if (d_sz != VK_WHOLE_SIZE) {
@@ -3386,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             case GGML_OP_ROPE:
                 elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
                 break;
+            case GGML_OP_GET_ROWS:
+                elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+                break;
             default:
                 elements = { (uint32_t)ne0, 1, 1 };
                 break;
@@ -3420,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }
 
 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }
 
 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3576,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
     }
 }
 
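These two calls are functionally unchanged; the added braces make the array member of the push-constant aggregate explicit instead of relying on brace elision, which some compilers flag. A tiny illustration of that C++ rule with a hypothetical struct (`rope_pc_example` is not the real `vk_op_rope_push_constants`; it only mirrors the shape the call sites suggest):

```cpp
// Hypothetical push-constant layout: an array member between scalar members.
struct rope_pc_example {
    unsigned ncols;
    float    corr_dims[4];
    float    theta_scale;
};

// Old style: the four array values flattened into the outer braces (valid via brace elision).
static rope_pc_example pc_flat   = { 64u, 0.0f, 1.0f, 0.0f, 0.0f, 0.5f };
// New style: the array member gets its own braces, matching the edited calls above.
static rope_pc_example pc_nested = { 64u, { 0.0f, 1.0f, 0.0f, 0.0f }, 0.5f };
```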
@@ -3587,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
     ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
 }
 
-static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-        vk_buffer d_D = extra_src0->buffer_gpu.lock();
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
-    }
-}
-
 #ifdef GGML_VULKAN_RUN_TESTS
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3619,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
                 val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
             } else if (type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -3920,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
                 val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -4335,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 
     std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
 
-    if (avg_err > 0.
+    if (avg_err > 0.01 || std::isnan(avg_err)) {
         std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
         std::cerr << "Actual result: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4385,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
     return extra;
 }
 
-static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
-    return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
-}
-
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-    if (extra == nullptr) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra = ggml_vk_tensor_create_extra(node);
-    }
 
     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
@@ -4425,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const int64_t ne22 = node->ne[2];
     const int64_t ne23 = node->ne[3];
 
-    const
+    const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+    const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+    const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
 
     int split_k;
     if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4437,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const uint32_t y_ne = ne10 * ne11;
     const uint32_t d_ne = ne20 * ne21;
 
-    const uint64_t
-    const uint64_t
-    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+    const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+    const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
     uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
     const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
 
@@ -4483,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (ctx->prealloc_size_qx < qx_sz) {
-            ctx->prealloc_size_qx = qx_sz;
-        }
-        if (ctx->prealloc_size_qy < qy_sz) {
-            ctx->prealloc_size_qy = qy_sz;
-        }
         if (ctx->prealloc_size_x < x_sz) {
             ctx->prealloc_size_x = x_sz;
         }
@@ -4512,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         return;
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(
+    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4575,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
 
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
     std::cerr << std::endl;
 
     const std::vector<size_t> vals {
@@ -4614,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     GGML_ASSERT(false);
 #endif
 
-    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
-        // Resize buffer
-        if (ctx->prealloc_qx != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qx);
-        }
-        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
-    }
-    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
-        // Resize buffer
-        if (ctx->prealloc_qy != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qy);
-        }
-        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
-    }
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
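The removed qx/qy branches followed the same grow-only pattern that prealloc_x, prealloc_y and prealloc_split_k still use: a preallocated buffer is destroyed and recreated only when the requested size exceeds its current capacity. A standalone sketch of that pattern with a hypothetical ToyBuffer stand-in for vk_buffer:

    // Illustrative sketch only: ToyBuffer and create_device_buffer are hypothetical
    // stand-ins for vk_buffer / ggml_vk_create_buffer_device.
    #include <cstddef>
    #include <cstdio>
    #include <memory>

    struct ToyBuffer { size_t size; };
    using ToyBufferPtr = std::shared_ptr<ToyBuffer>;

    static ToyBufferPtr create_device_buffer(size_t size) {
        return std::make_shared<ToyBuffer>(ToyBuffer{size});
    }

    // Grow-only reallocation: keep the existing buffer unless it is too small.
    static void ensure_capacity(ToyBufferPtr & buf, size_t required) {
        if (buf == nullptr || (required > 0 && buf->size < required)) {
            buf.reset();                       // drop (destroy) the old buffer
            buf = create_device_buffer(required);
        }
    }

    int main() {
        ToyBufferPtr prealloc_x;
        ensure_capacity(prealloc_x, 1 << 20);  // first graph: allocate 1 MiB
        ensure_capacity(prealloc_x, 1 << 16);  // smaller request: buffer is reused
        ensure_capacity(prealloc_x, 1 << 22);  // larger request: buffer is recreated
        std::printf("final size: %zu\n", prealloc_x->size);
        return 0;
    }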
@@ -4661,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    …
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
 
@@ -4693,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         }
         break;
     case GGML_OP_REPEAT:
-    …
+    case GGML_OP_GET_ROWS:
     case GGML_OP_ADD:
     case GGML_OP_MUL:
     case GGML_OP_SCALE:
@@ -4717,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ARGSORT:
         break;
     default:
-        …
-        …
-        GGML_ASSERT(false);
-    }
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+        GGML_ASSERT(false);
         return;
     }
 
@@ -4769,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4837,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    …
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
+    if (ctx->disable) {
         return false;
     }
 
@@ -4884,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-            return false;
-        }
-
         extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         break;
@@ -5001,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 #endif
     ggml_vk_graph_cleanup(ctx);
 
-    ggml_vk_destroy_buffer(ctx->prealloc_qx);
-    ggml_vk_destroy_buffer(ctx->prealloc_qy);
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5013,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_destroy_buffer(buffer);
     }
 
-    ctx->prealloc_size_qx = 0;
-    ctx->prealloc_size_qy = 0;
     ctx->prealloc_size_x = 0;
     ctx->prealloc_size_y = 0;
     ctx->prealloc_size_split_k = 0;
@@ -5045,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
     snprintf(description, description_size, "%s", props.deviceName.data());
 }
 
-// CPU assist interface
-
-void ggml_vk_init_cpu_assist() {
-    ggml_vk_instance_init();
-
-    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
-    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
-        ggml_vk_print_gpu_info(i);
-    }
-    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
-    ggml_backend_vk_init(0);
-}
-
-void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers_graph(ctx, node);
-}
-
-void ggml_vk_preallocate_buffers_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers(ctx);
-}
-
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_build_graph(ctx, node, last_node);
-}
-
-bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return false;
-    }
-
-    return ggml_vk_compute_forward(ctx, params, tensor);
-}
-
-void ggml_vk_graph_cleanup_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-}
-
-void ggml_vk_free_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
-        return;
-    }
-
-    ggml_backend_vk_free(vk_instance.backends[0]);
-}
-
 // backend interface
 
 #define UNUSED GGML_UNUSED
@@ -5330,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t …
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << …
+    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
 
-    GGML_ASSERT(…
+    GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
-    ggml_backend_vk_init(…
+    ggml_backend_vk_init(dev_num);
 
-    return &vk_instance.buffer_types[…
+    return &vk_instance.buffer_types[dev_num];
 }
 
 // host buffer type
@@ -5508,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, …
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
         return true;
     }
 
@@ -5542,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5566,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }
 
@@ -5602,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             }
             break;
         case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
+        // case GGML_OP_MUL_MAT_ID:
             {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                        break;
+                    default:
+                        return false;
+                }
                 struct ggml_tensor * a;
                 struct ggml_tensor * b;
                 if (op->op == GGML_OP_MUL_MAT) {
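The switch added above whitelists the tensor formats the Vulkan MUL_MAT path now accepts (including the k-quants); any other format makes supports_op report false so the op stays on another backend. A toy sketch of such a whitelist check (ToyType is a hypothetical stand-in for ggml_type and the list is abbreviated):

    // Illustrative sketch only: ToyType is a hypothetical stand-in for ggml_type;
    // the whitelist mirrors the idea of the switch above, not its exact contents.
    #include <cstdio>

    enum class ToyType { F32, F16, Q4_0, Q8_0, Q6_K, IQ2_XXS };

    static bool mul_mat_supported(ToyType t) {
        switch (t) {
            case ToyType::F32:
            case ToyType::F16:
            case ToyType::Q4_0:
            case ToyType::Q8_0:
            case ToyType::Q6_K:
                return true;   // a shader pipeline exists for these formats
            default:
                return false;  // anything else stays on another backend
        }
    }

    int main() {
        std::printf("Q6_K supported:    %d\n", mul_mat_supported(ToyType::Q6_K));
        std::printf("IQ2_XXS supported: %d\n", mul_mat_supported(ToyType::IQ2_XXS));
        return 0;
    }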
@@ -5618,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return true;
             } break;
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
         case GGML_OP_CPY:
+        case GGML_OP_DUP:
             {
                 ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
+                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
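GGML_OP_DUP carries no second source, so the new src1_type fallback above treats a missing op->src[1] as having the same type as the source before the copy-support checks run. A tiny sketch of that null-fallback idea (ToyTensor/ToyType are hypothetical and only the F32 case is shown; the real function tests more combinations):

    // Illustrative sketch only: ToyTensor/ToyType are hypothetical stand-ins.
    #include <cstdio>

    enum class ToyType { F32, F16 };

    struct ToyTensor {
        ToyType           src0_type;
        const ToyTensor * src1; // nullptr for DUP-style single-source ops
    };

    static bool cpy_supported(const ToyTensor & op) {
        const ToyType src0_type = op.src0_type;
        // DUP has no second operand, so fall back to the source type itself.
        const ToyType src1_type = op.src1 != nullptr ? op.src1->src0_type : src0_type;
        return src0_type == ToyType::F32 && src1_type == ToyType::F32;
    }

    int main() {
        const ToyTensor dup{ToyType::F32, nullptr};
        std::printf("DUP on F32 supported: %d\n", cpy_supported(dup));
        return 0;
    }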
@@ -5648,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return false;
             } break;
-        case GGML_OP_DUP:
         // case GGML_OP_REPEAT:
         //     {
         //         ggml_type src0_type = op->src[0]->type;
@@ -5685,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const ggml_tensor * dst = op;
+
+    const int min_batch_size = 32;
+
+    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+        return true;
+    }
+
+    return false;
+
+    UNUSED(backend);
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
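The new ggml_backend_vk_offload_op asks the scheduler to offload an operation only when its batch dimension (ne[1]) exceeds 32, and never for GGML_OP_GET_ROWS, presumably so small single-token workloads do not pay host-device transfer costs. A standalone sketch of the same threshold check with a hypothetical ToyOp stand-in:

    // Illustrative sketch only: ToyOp is a hypothetical stand-in for ggml_tensor;
    // the threshold matches the min_batch_size used above.
    #include <cstdint>
    #include <cstdio>

    enum class ToyOpKind { MUL_MAT, GET_ROWS };

    struct ToyOp {
        ToyOpKind kind;
        int64_t   ne1;  // batch dimension, ne[1] in ggml terms
    };

    static bool should_offload(const ToyOp & op) {
        const int min_batch_size = 32;
        // Large batches amortize the host<->device transfer; GET_ROWS never offloads.
        return op.ne1 > min_batch_size && op.kind != ToyOpKind::GET_ROWS;
    }

    int main() {
        std::printf("prompt batch (512): %d\n", should_offload({ToyOpKind::MUL_MAT, 512}));
        std::printf("single token (1):   %d\n", should_offload({ToyOpKind::MUL_MAT, 1}));
        return 0;
    }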
@@ -5699,7 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
-    /* .offload_op = */ …
+    /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -5712,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t …
-    if (vk_instance.initialized[…
-        return vk_instance.backends[…
+GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    if (vk_instance.initialized[dev_num]) {
+        return vk_instance.backends[dev_num];
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << …
+    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
 #endif
 
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[…
-    ggml_vk_init(ctx, …
-    ctx->name = GGML_VK_NAME + std::to_string(…
-    vk_instance.buffer_types[…
+    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+    ggml_vk_init(ctx, dev_num);
+    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+    vk_instance.buffer_types[dev_num] = {
         /* .iface = */ ggml_backend_vk_buffer_type_interface,
         /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
     };
-    vk_instance.initialized[…
+    vk_instance.initialized[dev_num] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
         /* .guid = */ ggml_backend_vk_guid(),
@@ -5735,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };
 
-    vk_instance.backends[…
+    vk_instance.backends[dev_num] = vk_backend;
 
     return vk_backend;
 }
@@ -5779,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
 extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
 
 GGML_CALL int ggml_backend_vk_reg_devices() {
-    …
+    ggml_vk_instance_init();
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, …
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(…
+        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
     }
     return vk_instance.device_indices.size();
 }
@@ -5866,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
                 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -5960,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
     ggml_tensor * src2 = tensor->src[2];
@@ -6219,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
     } else if (tensor->op == GGML_OP_TRANSPOSE) {
         tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_GET_ROWS) {
+        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ASSERT(false);
@@ -6269,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
 
@@ -6412,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         free(tensor_data);
     }
 }
-
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    ggml_vk_check_results_0(ctx, params, tensor);
-}
 #endif