llama_cpp 0.14.3 → 0.14.4

@@ -9,7 +9,6 @@
  #include <algorithm>
  #include <cmath>
  #include <iostream>
- #include <iomanip>
  #include <limits>
  #include <tuple>
  #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
  size_t semaphore_idx, event_idx;
  ggml_vk_garbage_collector gc;
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
- size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
- vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
+ size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+ vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
  vk::Fence fence;
  vk_buffer staging;
  size_t staging_size;
@@ -809,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+ std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
  #endif
  vk_buffer buf = std::make_shared<vk_buffer_struct>();

@@ -998,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

  if (device->fp16) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1055,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
  } else {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1111,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
  }

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1139,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

  // get_rows
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -1341,7 +1417,33 @@ void ggml_vk_instance_init() {
  vk_instance.device_indices.push_back(tmp);
  }
  } else {
- vk_instance.device_indices.push_back(0);
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+ // Make sure at least one device exists
+ if (devices.empty()) {
+ std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+ GGML_ASSERT(false);
+ }
+
+ // Default to using all dedicated GPUs
+ for (size_t i = 0; i < devices.size(); i++) {
+ vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+ if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ vk_instance.device_indices.push_back(i);
+ }
+ }
+
+ // If no dedicated GPUs found, fall back to GPU 0
+ if (vk_instance.device_indices.empty()) {
+ vk_instance.device_indices.push_back(0);
+ }
+ }
+
+ std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+ ggml_vk_print_gpu_info(i);
  }

  vk_instance_initialized = true;
@@ -1567,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

  switch (src0_type) {
  case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default:
  return nullptr;
@@ -2034,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
  ggml_vk_submit(subctx, ctx->fence);
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
  ctx->device->device.resetFences({ ctx->fence });
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
  }
  }

@@ -2131,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
  for (auto& cpy : subctx->out_memcpys) {
  memcpy(cpy.dst, cpy.src, cpy.n);
  }
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
  }
  }

@@ -2298,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
  case VK_VENDOR_ID_INTEL:
  return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+ default:
+ break;
  }

  if (m <= 32 || n <= 32) {
@@ -2423,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  src1_uma = d_Qy != nullptr;
  }

- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

  const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -2469,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  uint64_t x_buf_offset = 0;
  vk_buffer d_Y;
  uint64_t y_buf_offset = 0;
- if (load_x) {
- d_Qx = ctx->prealloc_qx;
- } else if (!src0_uma) {
+ if (!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
  qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
  }
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if (!src1_uma) {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qy != nullptr);
@@ -2530,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su

  if (x_non_contig) {
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
- } else if (load_x || qx_needs_dequant) {
- if (load_x) {
- // copy data to device
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
- ctx->staging_offset = qx_sz * ne02 * ne03;
- }
-
- if (qx_needs_dequant) {
- const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
- ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
- }
+ } else if (qx_needs_dequant) {
+ const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
  }
  if (y_non_contig) {
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
- } else if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
  }

  uint32_t stride_batch_x = ne00*ne01;
  uint32_t stride_batch_y = ne10*ne11;

- if (!ggml_vk_dim01_contiguous(src0) && !load_x && !qx_needs_dequant) {
+ if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
  stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
  }

- if (!ggml_vk_dim01_contiguous(src1) && !load_y && !qy_needs_dequant) {
+ if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
  stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
  }

@@ -2616,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
  src1_uma = d_Qy != nullptr;
  }

- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

  const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

@@ -2644,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
  uint64_t x_buf_offset = 0;
  vk_buffer d_Y;
  uint64_t y_buf_offset = 0;
- if (load_x) {
- d_Qx = ctx->prealloc_qx;
- } else if(!src1_uma) {
+ if(!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
  qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
  }
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if(!src1_uma) {
+ if(!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qy != nullptr);
@@ -2700,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
  if (x_non_contig) {
  GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
- } else if (load_x) {
- // copy data to device
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
  }
  if (y_non_contig) {
  GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
- } else if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
  }

  for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2789,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  src1_uma = d_Qy != nullptr;
  }

- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
  const uint64_t x_ne = ne00 * ne01 * ne02;
  const uint64_t y_ne = ne10 * ne11 * ne12;
  const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2805,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
  const uint64_t qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if (!src1_uma) {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qx != nullptr);
@@ -2822,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

- if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
- }
-
  // compute
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
@@ -2881,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  src1_uma = d_Qy != nullptr;
  }

- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
  const uint64_t d_ne = ne01 * ne11 * ne12;

  const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2898,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
  const uint64_t qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qx != nullptr);
@@ -2915,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

- if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
- }
-
  // compute
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
@@ -3174,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
  #endif
- GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+ GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
  GGML_ASSERT(dst->extra != nullptr);
  const uint64_t ne00 = src0->ne[0];
@@ -3242,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }
  }

- const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
- const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
- uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+ uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
  uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
  uint64_t d_sz = ggml_type_size(dst->type) * ne0;
@@ -3261,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  GGML_ASSERT(d_D != nullptr);
  uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
- if (transfer_src0) {
- d_X = ctx->prealloc_qx;
- } else if(!src0_uma) {
+ if(!src0_uma) {
  d_X = extra_src0->buffer_gpu.lock();
  x_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_X != nullptr);
  }
- if (transfer_src1) {
- d_Y = ctx->prealloc_qy;
- } else if (use_src1 && !src1_uma) {
+ if (use_src1 && !src1_uma) {
  d_Y = extra_src1->buffer_gpu.lock();
  y_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Y != nullptr);
  }

- GGML_ASSERT(!transfer_src2);
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
  z_buf_offset = extra_src2->offset;
  GGML_ASSERT(d_Z != nullptr);
  }

- if (op == GGML_OP_CPY) {
- GGML_ASSERT(!transfer_src0);
- GGML_ASSERT(!transfer_src1);
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
  x_sz = ggml_nbytes(src0);
+ y_sz = use_src1 ? ggml_nbytes(src1) : 0;
  d_sz = ggml_nbytes(dst);

- if (extra_src0->offset + x_sz >= d_X->size) {
+ if (x_buf_offset + x_sz >= d_X->size) {
  x_sz = VK_WHOLE_SIZE;
  }
- if (extra->offset + d_sz >= d_D->size) {
+ if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+ y_sz = VK_WHOLE_SIZE;
+ }
+ if (d_buf_offset + d_sz >= d_D->size) {
  d_sz = VK_WHOLE_SIZE;
  }
  }

  std::array<uint32_t, 3> elements;

- // copy src0 to device
- if (transfer_src0) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
- ctx->staging_offset = x_sz * ne02 * ne03;
- }
- if (transfer_src1) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
- }
-
  // Single call if dimension 2 is contiguous
- if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

  switch (dst->op) {
@@ -3322,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  case GGML_OP_ROPE:
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
  break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
  default:
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
  break;
  }

- if (op != GGML_OP_CPY) {
+ if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
  if (x_sz != VK_WHOLE_SIZE) {
  x_sz *= ne02 * ne03;
  }
- if (y_sz != VK_WHOLE_SIZE) {
+ if (use_src1 && y_sz != VK_WHOLE_SIZE) {
  y_sz *= ne12 * ne13;
  }
  if (d_sz != VK_WHOLE_SIZE) {
@@ -3386,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  case GGML_OP_ROPE:
  elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
  break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
  default:
  elements = { (uint32_t)ne0, 1, 1 };
  break;
@@ -3420,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }

  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+ (uint32_t)ggml_nelements(src0),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f,
+ });
  }

  static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3576,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  if (is_neox) {
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
  } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
  }
  }

@@ -3587,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
  ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
  }

- static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
- // If backend is CPU, data from src0 has to be copied off the device
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
- vk_buffer d_D = extra_src0->buffer_gpu.lock();
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
- }
- }
-
  #ifdef GGML_VULKAN_RUN_TESTS
  static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
  if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3619,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
  val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
  } else if (type == GGML_TYPE_F16) {
  val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+ } else {
+ GGML_ASSERT(false);
  }
  fprintf(stderr, "% 7.2f ", val);
  } else {
@@ -3920,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
  val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
  } else if (tensor->type == GGML_TYPE_F16) {
  val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+ } else {
+ GGML_ASSERT(false);
  }
  fprintf(stderr, "% 7.2f ", val);
  } else {
@@ -4335,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

  std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

- if (avg_err > 0.1 || std::isnan(avg_err)) {
+ if (avg_err > 0.01 || std::isnan(avg_err)) {
  std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
  std::cerr << "Actual result: " << std::endl << std::endl;
  ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4385,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
  return extra;
  }

- static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
- return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
- }
-
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
  #endif
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
- || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
  return;
  }

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
- if (extra == nullptr) {
- // Workaround for CPU backend BLAS matmul calls
- extra = ggml_vk_tensor_create_extra(node);
- }

  ggml_tensor * src0 = node->src[0];
  ggml_tensor * src1 = node->src[1];
@@ -4425,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  const int64_t ne22 = node->ne[2];
  const int64_t ne23 = node->ne[3];

- const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
+ const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+ const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+ const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+ const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+ bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+ const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+ const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

  int split_k;
  if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4437,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  const uint32_t y_ne = ne10 * ne11;
  const uint32_t d_ne = ne20 * ne21;

- const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
- const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
- const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
- const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+ const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+ const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
  uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
  const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

@@ -4483,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  break;
  case GGML_OP_MUL_MAT:
  case GGML_OP_MUL_MAT_ID:
- if (ctx->prealloc_size_qx < qx_sz) {
- ctx->prealloc_size_qx = qx_sz;
- }
- if (ctx->prealloc_size_qy < qy_sz) {
- ctx->prealloc_size_qy = qy_sz;
- }
  if (ctx->prealloc_size_x < x_sz) {
  ctx->prealloc_size_x = x_sz;
  }
@@ -4512,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  return;
  }
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
+ std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
  #endif
  #if defined(GGML_VULKAN_RUN_TESTS)
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4575,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4575
4627
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
4576
4628
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
4577
4629
 
4630
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
4631
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
4632
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
4633
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
4634
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
4635
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
4636
+
4637
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
4638
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
4639
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
4640
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
4641
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
4642
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
4643
+
4644
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
4645
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
4646
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
4647
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
4648
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
4649
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
4650
+
4651
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
4652
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
4653
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
4654
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
4655
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
4656
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
4657
+
4658
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
4659
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
4660
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
4661
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
4662
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
4663
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
4664
+
4578
4665
  std::cerr << std::endl;
4579
4666
 
4580
4667
  const std::vector<size_t> vals {
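The RUN_TESTS block above now also exercises the new K-quant mat-mat pipelines, repeating for each K-quant type the same 128x512x512, batch-2, 100-iteration combinations already used for Q8_0; the two values before the type (presumably the split_k factor and the shader size variant) cycle through 1/4 and 0/1/2. The explicit list could equally be generated by a loop; a sketch, assuming the same ggml_vk_test_dequant_matmul signature as the calls above:

    // Would sit inside the GGML_VULKAN_RUN_TESTS block, where ctx is in scope.
    const ggml_type kquants[] = {
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K,
    };
    for (ggml_type type : kquants) {
        for (size_t split_k : { 1, 4 }) {          // presumed split_k factor
            for (size_t variant : { 0, 1, 2 }) {   // presumed shader size variant
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, split_k, variant, type);
            }
        }
    }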
@@ -4614,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4614
4701
  GGML_ASSERT(false);
4615
4702
  #endif
4616
4703
 
4617
- if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
4618
- // Resize buffer
4619
- if (ctx->prealloc_qx != nullptr) {
4620
- ggml_vk_destroy_buffer(ctx->prealloc_qx);
4621
- }
4622
- ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
4623
- }
4624
- if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
4625
- // Resize buffer
4626
- if (ctx->prealloc_qy != nullptr) {
4627
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
4628
- }
4629
- ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
4630
- }
4631
4704
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
4632
4705
  // Resize buffer
4633
4706
  if (ctx->prealloc_x != nullptr) {
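With the qx/qy buffers gone, only prealloc_x, prealloc_y and prealloc_split_k keep the grow-only pattern shown here: if the buffer is missing or smaller than required, destroy it and recreate it at the new size. The remaining blocks could share a small helper; a sketch using the same calls as above (the helper itself is hypothetical):

    // Hypothetical helper mirroring the resize blocks in this function.
    static void ggml_vk_ensure_prealloc_sketch(ggml_backend_vk_context * ctx, vk_buffer & buf, size_t required) {
        if (buf == nullptr || (required > 0 && buf->size < required)) {
            if (buf != nullptr) {
                ggml_vk_destroy_buffer(buf);  // drop the undersized buffer before growing
            }
            buf = ggml_vk_create_buffer_device(ctx, required);
        }
    }

    // Usage: ggml_vk_ensure_prealloc_sketch(ctx, ctx->prealloc_x, ctx->prealloc_size_x);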
@@ -4661,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4661
4734
  }
4662
4735
 
4663
4736
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
4664
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4665
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4666
- || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4667
-
4668
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
4737
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
4669
4738
  return;
4670
4739
  }
4671
4740
 
@@ -4693,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4693
4762
  }
4694
4763
  break;
4695
4764
  case GGML_OP_REPEAT:
4696
- // case GGML_OP_GET_ROWS:
4765
+ case GGML_OP_GET_ROWS:
4697
4766
  case GGML_OP_ADD:
4698
4767
  case GGML_OP_MUL:
4699
4768
  case GGML_OP_SCALE:
@@ -4717,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4717
4786
  case GGML_OP_ARGSORT:
4718
4787
  break;
4719
4788
  default:
4720
- if (any_on_device) {
4721
- std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
4722
- GGML_ASSERT(false);
4723
- }
4789
+ std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
4790
+ GGML_ASSERT(false);
4724
4791
  return;
4725
4792
  }
4726
4793
 
@@ -4769,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4769
4836
  case GGML_OP_PERMUTE:
4770
4837
  case GGML_OP_TRANSPOSE:
4771
4838
  case GGML_OP_NONE:
4772
- ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
4773
-
4774
4839
  break;
4775
4840
  case GGML_OP_NORM:
4776
4841
  ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4837,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4837
4902
  }
4838
4903
 
4839
4904
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
4840
- const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
4841
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4842
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4843
-
4844
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
4905
+ if (ctx->disable) {
4845
4906
  return false;
4846
4907
  }
4847
4908
 
@@ -4884,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
4884
4945
  break;
4885
4946
  case GGML_OP_MUL_MAT:
4886
4947
  case GGML_OP_MUL_MAT_ID:
4887
- if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
4888
- return false;
4889
- }
4890
-
4891
4948
  extra = (ggml_tensor_extra_gpu *) tensor->extra;
4892
4949
 
4893
4950
  break;
@@ -5001,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5001
5058
  #endif
5002
5059
  ggml_vk_graph_cleanup(ctx);
5003
5060
 
5004
- ggml_vk_destroy_buffer(ctx->prealloc_qx);
5005
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
5006
5061
  ggml_vk_destroy_buffer(ctx->prealloc_x);
5007
5062
  ggml_vk_destroy_buffer(ctx->prealloc_y);
5008
5063
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5013,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5013
5068
  ggml_vk_destroy_buffer(buffer);
5014
5069
  }
5015
5070
 
5016
- ctx->prealloc_size_qx = 0;
5017
- ctx->prealloc_size_qy = 0;
5018
5071
  ctx->prealloc_size_x = 0;
5019
5072
  ctx->prealloc_size_y = 0;
5020
5073
  ctx->prealloc_size_split_k = 0;
@@ -5045,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
5045
5098
  snprintf(description, description_size, "%s", props.deviceName.data());
5046
5099
  }
5047
5100
 
5048
- // CPU assist interface
5049
-
5050
- void ggml_vk_init_cpu_assist() {
5051
- ggml_vk_instance_init();
5052
-
5053
- std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
5054
-
5055
- for (int i = 0; i < ggml_vk_get_device_count(); i++) {
5056
- ggml_vk_print_gpu_info(i);
5057
- }
5058
- // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
5059
- ggml_backend_vk_init(0);
5060
- }
5061
-
5062
- void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
5063
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5064
-
5065
- if (!ctx->initialized) {
5066
- return;
5067
- }
5068
-
5069
- ggml_vk_preallocate_buffers_graph(ctx, node);
5070
- }
5071
-
5072
- void ggml_vk_preallocate_buffers_cpu_assist() {
5073
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5074
-
5075
- if (!ctx->initialized) {
5076
- return;
5077
- }
5078
-
5079
- ggml_vk_preallocate_buffers(ctx);
5080
- }
5081
-
5082
- void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
5083
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5084
-
5085
- if (!ctx->initialized) {
5086
- return;
5087
- }
5088
-
5089
- ggml_vk_build_graph(ctx, node, last_node);
5090
- }
5091
-
5092
- bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
5093
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5094
-
5095
- if (!ctx->initialized) {
5096
- return false;
5097
- }
5098
-
5099
- return ggml_vk_compute_forward(ctx, params, tensor);
5100
- }
5101
-
5102
- void ggml_vk_graph_cleanup_cpu_assist() {
5103
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5104
-
5105
- if (!ctx->initialized) {
5106
- return;
5107
- }
5108
-
5109
- ggml_vk_graph_cleanup(ctx);
5110
- }
5111
-
5112
- void ggml_vk_free_cpu_assist() {
5113
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5114
-
5115
- if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
5116
- return;
5117
- }
5118
-
5119
- ggml_backend_vk_free(vk_instance.backends[0]);
5120
- }
5121
-
5122
5101
  // backend interface
5123
5102
 
5124
5103
  #define UNUSED GGML_UNUSED
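The entire CPU-assist surface above is removed in this release; Vulkan execution is now driven only through the standard ggml backend interface. A minimal usage sketch under that model (ggml_backend_graph_compute and ggml_backend_free come from ggml-backend.h, not from this file; graph is assumed to be a cgraph whose tensors were allocated from this backend's buffer type):

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    // Sketch: run a prepared compute graph on Vulkan device 0 via the generic backend API.
    static void run_on_vulkan_sketch(struct ggml_cgraph * graph) {
        ggml_backend_t backend = ggml_backend_vk_init(0);  // dev_num = 0
        if (backend == nullptr) {
            return;
        }
        ggml_backend_graph_compute(backend, graph);
        ggml_backend_free(backend);
    }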
@@ -5330,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
5330
5309
  /* .is_host = */ NULL,
5331
5310
  };
5332
5311
 
5333
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) {
5312
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
5334
5313
  #ifdef GGML_VULKAN_DEBUG
5335
- std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl;
5314
+ std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
5336
5315
  #endif
5337
5316
 
5338
- GGML_ASSERT(idx < vk_instance.device_indices.size());
5317
+ GGML_ASSERT(dev_num < vk_instance.device_indices.size());
5339
5318
 
5340
- ggml_backend_vk_init(idx);
5319
+ ggml_backend_vk_init(dev_num);
5341
5320
 
5342
- return &vk_instance.buffer_types[idx];
5321
+ return &vk_instance.buffer_types[dev_num];
5343
5322
  }
5344
5323
 
5345
5324
  // host buffer type
@@ -5508,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
5508
5487
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
5509
5488
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
5510
5489
 
5511
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src));
5490
+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
5512
5491
  return true;
5513
5492
  }
5514
5493
 
@@ -5542,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
5542
5521
  }
5543
5522
 
5544
5523
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
5524
+ #ifdef GGML_VULKAN_DEBUG
5525
+ std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
5526
+ #endif
5545
5527
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5546
5528
 
5547
5529
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5566,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
5566
5548
  for (int i = 0; i < cgraph->n_nodes; i++) {
5567
5549
  ggml_tensor * node = cgraph->nodes[i];
5568
5550
 
5569
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
5551
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
5570
5552
  continue;
5571
5553
  }
5572
5554
 
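Besides the new debug trace of the node count, graph_compute now also skips empty tensors, not just the pure view/layout ops. ggml_is_empty lives in ggml itself; it is understood to flag a tensor with any zero-sized dimension, roughly:

    // Rough equivalent of ggml's ggml_is_empty, shown for reference only (see ggml.c).
    static bool is_empty_sketch(const struct ggml_tensor * t) {
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            if (t->ne[i] == 0) {
                return true;  // nothing to compute for a zero-sized dimension
            }
        }
        return false;
    }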
@@ -5602,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5602
5584
  }
5603
5585
  break;
5604
5586
  case GGML_OP_MUL_MAT:
5605
- case GGML_OP_MUL_MAT_ID:
5587
+ // case GGML_OP_MUL_MAT_ID:
5606
5588
  {
5589
+ switch (op->src[0]->type) {
5590
+ case GGML_TYPE_F32:
5591
+ case GGML_TYPE_F16:
5592
+ case GGML_TYPE_Q4_0:
5593
+ case GGML_TYPE_Q4_1:
5594
+ case GGML_TYPE_Q5_0:
5595
+ case GGML_TYPE_Q5_1:
5596
+ case GGML_TYPE_Q8_0:
5597
+ case GGML_TYPE_Q2_K:
5598
+ case GGML_TYPE_Q3_K:
5599
+ case GGML_TYPE_Q4_K:
5600
+ case GGML_TYPE_Q5_K:
5601
+ case GGML_TYPE_Q6_K:
5602
+ break;
5603
+ default:
5604
+ return false;
5605
+ }
5607
5606
  struct ggml_tensor * a;
5608
5607
  struct ggml_tensor * b;
5609
5608
  if (op->op == GGML_OP_MUL_MAT) {
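supports_op now gates GGML_OP_MUL_MAT on the weight type: F32, F16, the legacy 4/5/8-bit quants, and, new in this release, the K-quants, while GGML_OP_MUL_MAT_ID is temporarily commented out of the supported set. Callers normally reach this hook through the public wrapper; a sketch (the helper and sizes are illustrative, ggml_ctx is assumed to be an initialized ggml_context):

    // Sketch: ask the Vulkan backend whether a Q4_K x F32 matmul would be supported.
    static bool q4k_matmul_supported_sketch(ggml_backend_t vk_backend, struct ggml_context * ggml_ctx) {
        struct ggml_tensor * w = ggml_new_tensor_2d(ggml_ctx, GGML_TYPE_Q4_K, 4096, 4096);
        struct ggml_tensor * x = ggml_new_tensor_2d(ggml_ctx, GGML_TYPE_F32,  4096, 8);
        struct ggml_tensor * y = ggml_mul_mat(ggml_ctx, w, x);
        return ggml_backend_supports_op(vk_backend, y);  // forwards to ggml_backend_vk_supports_op
    }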
@@ -5618,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5618
5617
  }
5619
5618
  return true;
5620
5619
  } break;
5621
- // case GGML_OP_GET_ROWS:
5622
- // {
5623
- // switch (op->src[0]->type) {
5624
- // case GGML_TYPE_F16:
5625
- // case GGML_TYPE_F32:
5626
- // case GGML_TYPE_Q4_0:
5627
- // case GGML_TYPE_Q4_1:
5628
- // case GGML_TYPE_Q5_0:
5629
- // case GGML_TYPE_Q5_1:
5630
- // case GGML_TYPE_Q8_0:
5631
- // return true;
5632
- // default:
5633
- // return false;
5634
- // }
5635
- // } break;
5620
+ case GGML_OP_GET_ROWS:
5621
+ {
5622
+ switch (op->src[0]->type) {
5623
+ case GGML_TYPE_F32:
5624
+ case GGML_TYPE_F16:
5625
+ case GGML_TYPE_Q4_0:
5626
+ case GGML_TYPE_Q4_1:
5627
+ case GGML_TYPE_Q5_0:
5628
+ case GGML_TYPE_Q5_1:
5629
+ case GGML_TYPE_Q8_0:
5630
+ return true;
5631
+ default:
5632
+ return false;
5633
+ }
5634
+ } break;
5636
5635
  case GGML_OP_CPY:
5636
+ case GGML_OP_DUP:
5637
5637
  {
5638
5638
  ggml_type src0_type = op->src[0]->type;
5639
- ggml_type src1_type = op->src[1]->type;
5639
+ ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
5640
5640
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
5641
5641
  return true;
5642
5642
  }
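GGML_OP_GET_ROWS is now advertised for F32, F16 and the non-K quants, and GGML_OP_DUP is folded into the CPY case: DUP carries no src1, so the type check falls back to src0's own type and is treated as a same-type copy. An illustration of that fallback (only the F32 branch visible above is shown):

    // Illustration of the src1 fallback used for GGML_OP_DUP above.
    static bool dup_is_f32_copy_sketch(const struct ggml_tensor * op) {
        const ggml_type src0_type = op->src[0]->type;
        // DUP has no second source, so it is checked as a copy onto its own type.
        const ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
        return src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32;
    }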
@@ -5648,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5648
5648
  }
5649
5649
  return false;
5650
5650
  } break;
5651
- case GGML_OP_DUP:
5652
5651
  // case GGML_OP_REPEAT:
5653
5652
  // {
5654
5653
  // ggml_type src0_type = op->src[0]->type;
@@ -5685,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5685
5684
  UNUSED(backend);
5686
5685
  }
5687
5686
 
5687
+ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
5688
+ const ggml_tensor * dst = op;
5689
+
5690
+ const int min_batch_size = 32;
5691
+
5692
+ if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
5693
+ return true;
5694
+ }
5695
+
5696
+ return false;
5697
+
5698
+ UNUSED(backend);
5699
+ }
5700
+
5688
5701
  // TODO: enable async and synchronize
5689
5702
  static ggml_backend_i ggml_backend_vk_interface = {
5690
5703
  /* .get_name = */ ggml_backend_vk_name,
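The new offload_op hook (wired into the interface table in the next hunk) tells the scheduler when an op is worth running on the GPU even though its inputs currently sit in host memory: any result with more than 32 rows in dimension 1, except GET_ROWS. In practice, batched prompt processing qualifies while single-token decoding does not; a small self-check of that threshold:

    #include <cassert>
    #include <cstdint>
    #include "ggml.h"

    // Stand-alone restatement of the heuristic above, with a few example batch sizes.
    static bool would_offload_sketch(int64_t dst_ne1, enum ggml_op op) {
        const int min_batch_size = 32;
        return dst_ne1 > min_batch_size && op != GGML_OP_GET_ROWS;
    }

    static void offload_examples_sketch() {
        assert( would_offload_sketch(512, GGML_OP_MUL_MAT));   // prompt batch: offload
        assert(!would_offload_sketch(1,   GGML_OP_MUL_MAT));   // single-token decode: keep local
        assert(!would_offload_sketch(512, GGML_OP_GET_ROWS));  // GET_ROWS: never via this hook
    }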
@@ -5699,7 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
5699
5712
  /* .graph_plan_compute = */ NULL,
5700
5713
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
5701
5714
  /* .supports_op = */ ggml_backend_vk_supports_op,
5702
- /* .offload_op = */ NULL,
5715
+ /* .offload_op = */ ggml_backend_vk_offload_op,
5703
5716
  /* .event_new = */ NULL,
5704
5717
  /* .event_free = */ NULL,
5705
5718
  /* .event_record = */ NULL,
@@ -5712,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
5712
5725
  return &guid;
5713
5726
  }
5714
5727
 
5715
- GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5716
- if (vk_instance.initialized[idx]) {
5717
- return vk_instance.backends[idx];
5728
+ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
5729
+ if (vk_instance.initialized[dev_num]) {
5730
+ return vk_instance.backends[dev_num];
5718
5731
  }
5719
5732
  #ifdef GGML_VULKAN_DEBUG
5720
- std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl;
5733
+ std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
5721
5734
  #endif
5722
5735
 
5723
- ggml_backend_vk_context * ctx = &vk_instance.contexts[idx];
5724
- ggml_vk_init(ctx, idx);
5725
- ctx->name = GGML_VK_NAME + std::to_string(idx);
5726
- vk_instance.buffer_types[idx] = {
5736
+ ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
5737
+ ggml_vk_init(ctx, dev_num);
5738
+ ctx->name = GGML_VK_NAME + std::to_string(dev_num);
5739
+ vk_instance.buffer_types[dev_num] = {
5727
5740
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
5728
5741
  /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
5729
5742
  };
5730
- vk_instance.initialized[idx] = true;
5743
+ vk_instance.initialized[dev_num] = true;
5731
5744
 
5732
5745
  ggml_backend_t vk_backend = new ggml_backend {
5733
5746
  /* .guid = */ ggml_backend_vk_guid(),
@@ -5735,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5735
5748
  /* .context = */ &vk_instance.contexts[ctx->idx],
5736
5749
  };
5737
5750
 
5738
- vk_instance.backends[idx] = vk_backend;
5751
+ vk_instance.backends[dev_num] = vk_backend;
5739
5752
 
5740
5753
  return vk_backend;
5741
5754
  }
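The device-selection parameter is now consistently called dev_num (an index into vk_instance.device_indices) rather than the more ambiguous idx. From application code nothing changes in shape; a sketch of placing a context's tensors on device 0 through its buffer type (ggml_backend_alloc_ctx_tensors_from_buft comes from ggml-alloc.h):

    #include "ggml-alloc.h"
    #include "ggml-vulkan.h"

    // Sketch: place all tensors of a ggml_context in Vulkan device 0's memory.
    static ggml_backend_buffer_t alloc_on_vk0_sketch(struct ggml_context * ggml_ctx) {
        ggml_backend_buffer_type_t buft = ggml_backend_vk_buffer_type(0);  // dev_num = 0
        return ggml_backend_alloc_ctx_tensors_from_buft(ggml_ctx, buft);
    }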
@@ -5779,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
5779
5792
  extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
5780
5793
 
5781
5794
  GGML_CALL int ggml_backend_vk_reg_devices() {
5782
- for (auto idx : vk_instance.device_indices) {
5795
+ ggml_vk_instance_init();
5796
+
5797
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
5783
5798
  char name[128];
5784
- snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
5785
- ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx);
5799
+ snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
5800
+ ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
5786
5801
  }
5787
5802
  return vk_instance.device_indices.size();
5788
5803
  }
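ggml_backend_vk_reg_devices now initializes the Vulkan instance itself and registers one backend per configured device, naming them by position rather than by raw Vulkan device index. To see what will be registered, the public device-query helpers can be used; a sketch (assuming the usual declarations in ggml-vulkan.h):

    #include <cstdio>
    #include "ggml-vulkan.h"

    // Sketch: list the devices the registry above will expose.
    static void list_vk_devices_sketch() {
        const int n_dev = ggml_backend_vk_get_device_count();
        for (int i = 0; i < n_dev; i++) {
            char desc[256];
            ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
            printf("Vulkan device %d: %s\n", i, desc);
        }
    }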
@@ -5866,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
5866
5881
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
5867
5882
  } else if (tensor->type == GGML_TYPE_F16) {
5868
5883
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
5884
+ } else {
5885
+ GGML_ASSERT(false);
5869
5886
  }
5870
5887
  fprintf(stderr, "% 7.2f ", val);
5871
5888
  } else {
@@ -5960,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5960
5977
  return;
5961
5978
  }
5962
5979
 
5980
+ #ifdef GGML_VULKAN_DEBUG
5981
+ std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
5982
+ #endif
5983
+
5963
5984
  ggml_tensor * src0 = tensor->src[0];
5964
5985
  ggml_tensor * src1 = tensor->src[1];
5965
5986
  ggml_tensor * src2 = tensor->src[2];
@@ -6219,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6219
6240
  tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
6220
6241
  } else if (tensor->op == GGML_OP_TRANSPOSE) {
6221
6242
  tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
6243
+ } else if (tensor->op == GGML_OP_GET_ROWS) {
6244
+ tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
6222
6245
  } else {
6223
6246
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6224
6247
  GGML_ASSERT(false);
@@ -6269,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6269
6292
  return;
6270
6293
  }
6271
6294
 
6295
+ #ifdef GGML_VULKAN_DEBUG
6296
+ std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
6297
+ #endif
6298
+
6272
6299
  ggml_tensor * src0 = tensor->src[0];
6273
6300
  ggml_tensor * src1 = tensor->src[1];
6274
6301
 
@@ -6412,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6412
6439
  free(tensor_data);
6413
6440
  }
6414
6441
  }
6415
-
6416
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
6417
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
6418
-
6419
- ggml_vk_check_results_0(ctx, params, tensor);
6420
- }
6421
6442
  #endif