llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
-    vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
@@ -809,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();

@@ -998,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

     if (device->fp16) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1055,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1111,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     }

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1139,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -1341,7 +1417,33 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(tmp);
         }
     } else {
-        vk_instance.device_indices.push_back(0);
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            GGML_ASSERT(false);
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                vk_instance.device_indices.push_back(i);
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+
+    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
     }

     vk_instance_initialized = true;
@@ -1567,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

     switch (src0_type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
             break;
         default:
             return nullptr;
@@ -2034,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         ggml_vk_submit(subctx, ctx->fence);
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }

@@ -2131,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }

@@ -2298,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
     case VK_VENDOR_ID_INTEL:
         return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+    default:
+        break;
     }

     if (m <= 32 || n <= 32) {
@@ -2423,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

     const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -2469,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
+    if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2530,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su

     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (
-
-
-
-        ctx->staging_offset = qx_sz * ne02 * ne03;
-    }
-
-    if (qx_needs_dequant) {
-        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-    }
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }

     uint32_t stride_batch_x = ne00*ne01;
     uint32_t stride_batch_y = ne10*ne11;

-    if (!ggml_vk_dim01_contiguous(src0) && !
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
         stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
     }

-    if (!ggml_vk_dim01_contiguous(src1) && !
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }

@@ -2616,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

@@ -2644,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if(!src1_uma) {
+    if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if(!src1_uma) {
+    if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
@@ -2700,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (load_x) {
-        // copy data to device
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }

     for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2789,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2805,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2822,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -2881,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t d_ne = ne01 * ne11 * ne12;

     const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2898,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2915,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -3174,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
     const uint64_t ne00 = src0->ne[0];
@@ -3242,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }

-
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;
@@ -3261,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     GGML_ASSERT(d_D != nullptr);
     uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
-    if (transfer_src0) {
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
+    if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
         x_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_X != nullptr);
     }
-    if (transfer_src1) {
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
+    if (use_src1 && !src1_uma) {
        d_Y = extra_src1->buffer_gpu.lock();
        y_buf_offset = extra_src1->offset;
        GGML_ASSERT(d_Y != nullptr);
    }

-    GGML_ASSERT(!transfer_src2);
    if (use_src2 && !src2_uma) {
        d_Z = extra_src2->buffer_gpu.lock();
        z_buf_offset = extra_src2->offset;
        GGML_ASSERT(d_Z != nullptr);
    }

-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
        x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
        d_sz = ggml_nbytes(dst);

-        if (
+        if (x_buf_offset + x_sz >= d_X->size) {
            x_sz = VK_WHOLE_SIZE;
        }
-        if (
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
            d_sz = VK_WHOLE_SIZE;
        }
    }

    std::array<uint32_t, 3> elements;

-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
    // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

        switch (dst->op) {
@@ -3322,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
             break;
         }

-        if (op != GGML_OP_CPY) {
+        if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
             if (x_sz != VK_WHOLE_SIZE) {
                 x_sz *= ne02 * ne03;
             }
-            if (y_sz != VK_WHOLE_SIZE) {
+            if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                 y_sz *= ne12 * ne13;
             }
             if (d_sz != VK_WHOLE_SIZE) {
@@ -3386,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ne0, 1, 1 };
             break;
@@ -3420,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }

 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3576
3643
|
if (is_neox) {
|
3577
3644
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
3578
3645
|
const float inv_ndims = -1.0f / n_dims;
|
3579
|
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
|
3646
|
+
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
|
3580
3647
|
} else {
|
3581
|
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
|
3648
|
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
|
3582
3649
|
}
|
3583
3650
|
}
|
3584
3651
|
|
@@ -3587,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
     ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
 }

-static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-        vk_buffer d_D = extra_src0->buffer_gpu.lock();
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
-    }
-}
-
 #ifdef GGML_VULKAN_RUN_TESTS
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3619,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
                 val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
             } else if (type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -3920,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
                 val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
             } else if (tensor->type == GGML_TYPE_F16) {
                 val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+            } else {
+                GGML_ASSERT(false);
             }
             fprintf(stderr, "% 7.2f ", val);
         } else {
@@ -4335,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

     std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

-    if (avg_err > 0.
+    if (avg_err > 0.01 || std::isnan(avg_err)) {
         std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
         std::cerr << "Actual result: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4385,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
     return extra;
 }

-static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
-    return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
-}
-
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-    if (extra == nullptr) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra = ggml_vk_tensor_create_extra(node);
-    }

     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];
@@ -4425,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const int64_t ne22 = node->ne[2];
     const int64_t ne23 = node->ne[3];

-    const
+    const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+    const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+    const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

     int split_k;
     if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4437,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
         const uint32_t y_ne = ne10 * ne11;
         const uint32_t d_ne = ne20 * ne21;

-        const uint64_t
-        const uint64_t
-        const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-        const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+        const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+        const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
         uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
         const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

@@ -4483,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (ctx->prealloc_size_qx < qx_sz) {
-            ctx->prealloc_size_qx = qx_sz;
-        }
-        if (ctx->prealloc_size_qy < qy_sz) {
-            ctx->prealloc_size_qy = qy_sz;
-        }
         if (ctx->prealloc_size_x < x_sz) {
             ctx->prealloc_size_x = x_sz;
         }
@@ -4512,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         return;
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(
+    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4575,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
 
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
     std::cerr << std::endl;
 
     const std::vector<size_t> vals {
@@ -4614,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     GGML_ASSERT(false);
 #endif
 
-    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
-        // Resize buffer
-        if (ctx->prealloc_qx != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qx);
-        }
-        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
-    }
-    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
-        // Resize buffer
-        if (ctx->prealloc_qy != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qy);
-        }
-        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
-    }
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
@@ -4661,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
+    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
 
@@ -4693,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         }
         break;
     case GGML_OP_REPEAT:
-
+    case GGML_OP_GET_ROWS:
     case GGML_OP_ADD:
     case GGML_OP_MUL:
     case GGML_OP_SCALE:
@@ -4717,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ARGSORT:
         break;
     default:
-
-
-        GGML_ASSERT(false);
-        }
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+        GGML_ASSERT(false);
         return;
     }
 
@@ -4769,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4837,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
+    if (ctx->disable) {
         return false;
     }
 
@@ -4884,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-            return false;
-        }
-
         extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         break;
@@ -5001,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 #endif
     ggml_vk_graph_cleanup(ctx);
 
-    ggml_vk_destroy_buffer(ctx->prealloc_qx);
-    ggml_vk_destroy_buffer(ctx->prealloc_qy);
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5013,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_destroy_buffer(buffer);
     }
 
-    ctx->prealloc_size_qx = 0;
-    ctx->prealloc_size_qy = 0;
     ctx->prealloc_size_x = 0;
     ctx->prealloc_size_y = 0;
     ctx->prealloc_size_split_k = 0;
@@ -5045,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
     snprintf(description, description_size, "%s", props.deviceName.data());
 }
 
-// CPU assist interface
-
-void ggml_vk_init_cpu_assist() {
-    ggml_vk_instance_init();
-
-    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
-    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
-        ggml_vk_print_gpu_info(i);
-    }
-    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
-    ggml_backend_vk_init(0);
-}
-
-void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers_graph(ctx, node);
-}
-
-void ggml_vk_preallocate_buffers_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers(ctx);
-}
-
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_build_graph(ctx, node, last_node);
-}
-
-bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return false;
-    }
-
-    return ggml_vk_compute_forward(ctx, params, tensor);
-}
-
-void ggml_vk_graph_cleanup_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-}
-
-void ggml_vk_free_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
-        return;
-    }
-
-    ggml_backend_vk_free(vk_instance.backends[0]);
-}
-
 // backend interface
 
 #define UNUSED GGML_UNUSED
@@ -5330,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" <<
+    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif
 
-    GGML_ASSERT(
+    GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
-    ggml_backend_vk_init(
+    ggml_backend_vk_init(dev_num);
 
-    return &vk_instance.buffer_types[
+    return &vk_instance.buffer_types[dev_num];
 }
 
 // host buffer type
@@ -5508,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
     vk_buffer src_buf = src_extra->buffer_gpu.lock();
     vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_copy_async(ctx->transfer_ctx,
+    ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
     return true;
 }
 
@@ -5542,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+#endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5566,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -5602,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             }
             break;
         case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
+        // case GGML_OP_MUL_MAT_ID:
             {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                        break;
+                    default:
+                        return false;
+                }
                 struct ggml_tensor * a;
                 struct ggml_tensor * b;
                 if (op->op == GGML_OP_MUL_MAT) {
@@ -5618,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return true;
             } break;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
         case GGML_OP_CPY:
+        case GGML_OP_DUP:
             {
                 ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
+                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
@@ -5648,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 }
                 return false;
             } break;
-        case GGML_OP_DUP:
         // case GGML_OP_REPEAT:
         //     {
         //         ggml_type src0_type = op->src[0]->type;
@@ -5685,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const ggml_tensor * dst = op;
+
+    const int min_batch_size = 32;
+
+    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+        return true;
+    }
+
+    return false;
+
+    UNUSED(backend);
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
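The new ggml_backend_vk_offload_op hook above only reports an op as worth offloading when its second dimension exceeds a batch threshold of 32 and the op is not GGML_OP_GET_ROWS. A standalone sketch of that decision rule follows (plain C++, no ggml headers; the helper name and the example batch sizes are illustrative, not part of the diff).

#include <cstdint>
#include <cstdio>

// Mirrors the heuristic in ggml_backend_vk_offload_op: offload only "large batch" ops,
// and never GGML_OP_GET_ROWS.
static bool should_offload(int64_t ne1, bool is_get_rows) {
    const int64_t min_batch_size = 32;  // same threshold as in the hunk above
    return ne1 > min_batch_size && !is_get_rows;
}

int main() {
    std::printf("ne1=16,  other op -> %d\n", should_offload(16, false));   // 0: stays on its current backend
    std::printf("ne1=512, other op -> %d\n", should_offload(512, false));  // 1: offloaded to Vulkan
    std::printf("ne1=512, GET_ROWS -> %d\n", should_offload(512, true));   // 0: explicitly excluded
    return 0;
}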
@@ -5699,7 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
-    /* .offload_op = */
+    /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -5712,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t
-    if (vk_instance.initialized[
-        return vk_instance.backends[
+GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    if (vk_instance.initialized[dev_num]) {
+        return vk_instance.backends[dev_num];
     }
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" <<
+    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
 #endif
 
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[
-    ggml_vk_init(ctx,
-    ctx->name = GGML_VK_NAME + std::to_string(
-    vk_instance.buffer_types[
+    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+    ggml_vk_init(ctx, dev_num);
+    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+    vk_instance.buffer_types[dev_num] = {
         /* .iface = */ ggml_backend_vk_buffer_type_interface,
         /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
     };
-    vk_instance.initialized[
+    vk_instance.initialized[dev_num] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
         /* .guid = */ ggml_backend_vk_guid(),
@@ -5735,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
         /* .context = */ &vk_instance.contexts[ctx->idx],
     };
 
-    vk_instance.backends[
+    vk_instance.backends[dev_num] = vk_backend;
 
     return vk_backend;
 }
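With the parameter rename to dev_num applied throughout ggml_backend_vk_init and ggml_backend_vk_buffer_type, the public entry point still takes the zero-based Vulkan device index. A minimal host-side usage sketch, assuming the vendored headers (ggml-vulkan.h, ggml-backend.h) are on the include path and the library was built with Vulkan support; this is illustrative only, not code from the gem.

#include "ggml-backend.h"
#include "ggml-vulkan.h"
#include <cstdio>

int main() {
    // dev_num = 0 selects the first registered Vulkan device.
    ggml_backend_t backend = ggml_backend_vk_init(0);
    if (backend == nullptr) {
        std::fprintf(stderr, "ggml_backend_vk_init failed\n");
        return 1;
    }
    std::fprintf(stderr, "initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);  // release the backend object when done
    return 0;
}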
@@ -5779,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
 extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
 
 GGML_CALL int ggml_backend_vk_reg_devices() {
-
+    ggml_vk_instance_init();
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME,
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(
+        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i);  // NOLINT
     }
     return vk_instance.device_indices.size();
 }
@@ -5866,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
                     val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                 } else if (tensor->type == GGML_TYPE_F16) {
                     val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+                } else {
+                    GGML_ASSERT(false);
                 }
                 fprintf(stderr, "% 7.2f ", val);
             } else {
@@ -5960,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
     ggml_tensor * src2 = tensor->src[2];
@@ -6219,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
     } else if (tensor->op == GGML_OP_TRANSPOSE) {
         tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_GET_ROWS) {
+        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ASSERT(false);
@@ -6269,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
+#endif
+
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
 
@@ -6412,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         free(tensor_data);
     }
 }
-
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    ggml_vk_check_results_0(ctx, params, tensor);
-}
 #endif