llama_cpp 0.14.2 → 0.14.4

@@ -9,7 +9,6 @@
  #include <algorithm>
  #include <cmath>
  #include <iostream>
- #include <iomanip>
  #include <limits>
  #include <tuple>
  #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
  size_t semaphore_idx, event_idx;
  ggml_vk_garbage_collector gc;
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
- size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
- vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
+ size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+ vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
  vk::Fence fence;
  vk_buffer staging;
  size_t staging_size;
@@ -710,6 +709,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
  }
  }

+ // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+ // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+ if (compute_index >= 0) {
+ return compute_index;
+ }
+
  std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

  for(auto &q_family : queue_family_props) {
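The new early return relies on the Vulkan guarantee quoted in the comment above: any queue family that reports graphics or compute capability can also execute transfer commands, even if it does not advertise VK_QUEUE_TRANSFER_BIT. A minimal sketch of that selection order, assuming a hypothetical helper (this is not the actual ggml_vk_find_queue_family_index signature):

```cpp
#include <cstdint>
#include <vector>
#include <vulkan/vulkan.hpp>

// Prefer a dedicated transfer-only family; otherwise fall back to any compute-capable
// family, which the Vulkan spec guarantees can also execute transfer commands.
static int32_t pick_transfer_queue_family(const std::vector<vk::QueueFamilyProperties> & props) {
    int32_t compute_index = -1;
    for (int32_t i = 0; i < (int32_t) props.size(); i++) {
        const vk::QueueFlags flags = props[i].queueFlags;
        if ((flags & vk::QueueFlagBits::eTransfer) && !(flags & vk::QueueFlagBits::eGraphics)) {
            return i; // dedicated transfer family
        }
        if (flags & vk::QueueFlagBits::eCompute) {
            compute_index = i; // remembered as a fallback
        }
    }
    return compute_index; // -1 only if no suitable family exists at all
}
```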
@@ -803,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+ std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
  #endif
  vk_buffer buf = std::make_shared<vk_buffer_struct>();

@@ -992,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

  if (device->fp16) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1049,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
  } else {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1105,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
  }

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
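The accesses above (`->l`, `->m`, `->s` and `->a_l`, `->a_m`, `->a_s`) indicate that each quantization type now carries a bundle of six matrix-matrix pipelines: large, medium, and small tile variants plus "aligned" counterparts selected when the matrix dimensions permit aligned loads. A plausible shape for the struct, inferred from those accesses (member types are an assumption, not copied from the file):

```cpp
#include <memory>

// Stand-in for the backend's pipeline handle so the sketch is self-contained.
using vk_pipeline = std::shared_ptr<void>;

// One bundle per quant type, as created above with std::make_shared<vk_matmul_pipeline_struct>().
struct vk_matmul_pipeline_struct {
    vk_pipeline l, m, s;       // unaligned large / medium / small tile variants
    vk_pipeline a_l, a_m, a_s; // aligned variants
};
using vk_matmul_pipeline = std::shared_ptr<vk_matmul_pipeline_struct>;
```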
@@ -1133,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

  // get_rows
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);

@@ -1335,7 +1417,33 @@ void ggml_vk_instance_init() {
  vk_instance.device_indices.push_back(tmp);
  }
  } else {
- vk_instance.device_indices.push_back(0);
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+ // Make sure at least one device exists
+ if (devices.empty()) {
+ std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+ GGML_ASSERT(false);
+ }
+
+ // Default to using all dedicated GPUs
+ for (size_t i = 0; i < devices.size(); i++) {
+ vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+ if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ vk_instance.device_indices.push_back(i);
+ }
+ }
+
+ // If no dedicated GPUs found, fall back to GPU 0
+ if (vk_instance.device_indices.empty()) {
+ vk_instance.device_indices.push_back(0);
+ }
+ }
+
+ std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+ ggml_vk_print_gpu_info(i);
  }

  vk_instance_initialized = true;
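Instance initialization now defaults to every discrete GPU and only falls back to device 0 when none is found; the explicit index-list branch at the top of the hunk (populated from an environment variable, presumably GGML_VK_VISIBLE_DEVICES) still takes precedence. A condensed restatement of that default policy as a standalone helper, purely a sketch and not part of the ggml API:

```cpp
#include <cstddef>
#include <vector>
#include <vulkan/vulkan.hpp>

// Collect the indices of all discrete GPUs; if there are none, use device 0.
static std::vector<size_t> default_device_indices(const std::vector<vk::PhysicalDevice> & devices) {
    std::vector<size_t> indices;
    for (size_t i = 0; i < devices.size(); i++) {
        if (devices[i].getProperties().deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
            indices.push_back(i);
        }
    }
    if (indices.empty()) {
        indices.push_back(0); // integrated-only systems still get one device
    }
    return indices;
}
```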
@@ -1561,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

  switch (src0_type) {
  case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  break;
  default:
  return nullptr;
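With these cases added, the matrix-matrix path accepts every legacy quant and k-quant block format instead of only Q4_0. A condensed restatement of the quantized-type check (the helper name is hypothetical; the real function also depends on the src1 type, which is not shown in this hunk):

```cpp
#include "ggml.h"

// True for the quantized src0 types that now have dedicated mat-mat pipelines.
static bool vk_mul_mat_mat_quant_supported(ggml_type t) {
    switch (t) {
        case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
            return false;
    }
}
```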
@@ -2028,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
  ggml_vk_submit(subctx, ctx->fence);
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
  ctx->device->device.resetFences({ ctx->fence });
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
  }
  }

@@ -2125,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
  for (auto& cpy : subctx->out_memcpys) {
  memcpy(cpy.dst, cpy.src, cpy.n);
  }
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
  }
  }

@@ -2292,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
  return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
  case VK_VENDOR_ID_INTEL:
  return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+ default:
+ break;
  }

  if (m <= 32 || n <= 32) {
@@ -2417,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  src1_uma = d_Qy != nullptr;
  }

- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

  const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -2463,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
  uint64_t x_buf_offset = 0;
  vk_buffer d_Y;
  uint64_t y_buf_offset = 0;
- if (load_x) {
- d_Qx = ctx->prealloc_qx;
- } else if (!src0_uma) {
+ if (!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
  qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
  }
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if (!src1_uma) {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qy != nullptr);
@@ -2524,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su

  if (x_non_contig) {
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
- } else if (load_x || qx_needs_dequant) {
- if (load_x) {
- // copy data to device
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
- ctx->staging_offset = qx_sz * ne02 * ne03;
- }
-
- if (qx_needs_dequant) {
- const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
- ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
- }
+ } else if (qx_needs_dequant) {
+ const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
  }
  if (y_non_contig) {
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
- } else if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
  }

  uint32_t stride_batch_x = ne00*ne01;
  uint32_t stride_batch_y = ne10*ne11;

- if (!ggml_vk_dim01_contiguous(src0) && !load_x && !qx_needs_dequant) {
+ if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
  stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
  }

- if (!ggml_vk_dim01_contiguous(src1) && !load_y && !qy_needs_dequant) {
+ if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
  stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
  }

@@ -2610,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
  src1_uma = d_Qy != nullptr;
  }

- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

  const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;

@@ -2638,16 +2735,12 @@
  uint64_t x_buf_offset = 0;
  vk_buffer d_Y;
  uint64_t y_buf_offset = 0;
- if (load_x) {
- d_Qx = ctx->prealloc_qx;
- } else if(!src1_uma) {
+ if(!src0_uma) {
  d_Qx = extra_src0->buffer_gpu.lock();
  qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
  }
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if(!src1_uma) {
+ if(!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qy != nullptr);
@@ -2694,15 +2787,10 @@
  if (x_non_contig) {
  GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
- } else if (load_x) {
- // copy data to device
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
  }
  if (y_non_contig) {
  GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
- } else if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
  }

  for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2783,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  src1_uma = d_Qy != nullptr;
  }

- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
  const uint64_t x_ne = ne00 * ne01 * ne02;
  const uint64_t y_ne = ne10 * ne11 * ne12;
  const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2799,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
  const uint64_t qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else if (!src1_uma) {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qx != nullptr);
@@ -2816,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

- if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
- }
-
  // compute
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
@@ -2875,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  src1_uma = d_Qy != nullptr;
  }

- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
  const uint64_t d_ne = ne01 * ne11 * ne12;

  const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2892,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
  const uint64_t qx_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_Qx != nullptr);
- if (load_y) {
- d_Qy = ctx->prealloc_qy;
- } else {
+ if (!src1_uma) {
  d_Qy = extra_src1->buffer_gpu.lock();
  qy_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Qx != nullptr);
@@ -2909,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

- if (load_y) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
- }
-
  // compute
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
  ggml_vk_sync_buffers(subctx);
@@ -3168,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
  #endif
- GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
+ GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
  GGML_ASSERT(dst->extra != nullptr);
  const uint64_t ne00 = src0->ne[0];
@@ -3236,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  }
  }

- const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
- const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
- const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
- uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+ uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
  uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
  uint64_t d_sz = ggml_type_size(dst->type) * ne0;
@@ -3255,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  GGML_ASSERT(d_D != nullptr);
  uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
  GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
- if (transfer_src0) {
- d_X = ctx->prealloc_qx;
- } else if(!src0_uma) {
+ if(!src0_uma) {
  d_X = extra_src0->buffer_gpu.lock();
  x_buf_offset = extra_src0->offset;
  GGML_ASSERT(d_X != nullptr);
  }
- if (transfer_src1) {
- d_Y = ctx->prealloc_qy;
- } else if (use_src1 && !src1_uma) {
+ if (use_src1 && !src1_uma) {
  d_Y = extra_src1->buffer_gpu.lock();
  y_buf_offset = extra_src1->offset;
  GGML_ASSERT(d_Y != nullptr);
  }

- GGML_ASSERT(!transfer_src2);
  if (use_src2 && !src2_uma) {
  d_Z = extra_src2->buffer_gpu.lock();
  z_buf_offset = extra_src2->offset;
  GGML_ASSERT(d_Z != nullptr);
  }

- if (op == GGML_OP_CPY) {
- GGML_ASSERT(!transfer_src0);
- GGML_ASSERT(!transfer_src1);
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
  x_sz = ggml_nbytes(src0);
+ y_sz = use_src1 ? ggml_nbytes(src1) : 0;
  d_sz = ggml_nbytes(dst);

- if (extra_src0->offset + x_sz >= d_X->size) {
+ if (x_buf_offset + x_sz >= d_X->size) {
  x_sz = VK_WHOLE_SIZE;
  }
- if (extra->offset + d_sz >= d_D->size) {
+ if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+ y_sz = VK_WHOLE_SIZE;
+ }
+ if (d_buf_offset + d_sz >= d_D->size) {
  d_sz = VK_WHOLE_SIZE;
  }
  }

  std::array<uint32_t, 3> elements;

- // copy src0 to device
- if (transfer_src0) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
- ctx->staging_offset = x_sz * ne02 * ne03;
- }
- if (transfer_src1) {
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
- }
-
  // Single call if dimension 2 is contiguous
- if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

  switch (dst->op) {
@@ -3316,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  case GGML_OP_ROPE:
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
  break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
  default:
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
  break;
  }

- if (op != GGML_OP_CPY) {
+ if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
  if (x_sz != VK_WHOLE_SIZE) {
  x_sz *= ne02 * ne03;
  }
- if (y_sz != VK_WHOLE_SIZE) {
+ if (use_src1 && y_sz != VK_WHOLE_SIZE) {
  y_sz *= ne12 * ne13;
  }
  if (d_sz != VK_WHOLE_SIZE) {
@@ -3380,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
  case GGML_OP_ROPE:
  elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
  break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
  default:
  elements = { (uint32_t)ne0, 1, 1 };
  break;
@@ -3414,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
3414
3476
  }
3415
3477
 
3416
3478
  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3417
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
3479
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
3480
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
3481
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
3482
+
3483
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
3484
+ (uint32_t)ggml_nelements(src0),
3485
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
3486
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
3487
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
3488
+ 0,
3489
+ 0.0f, 0.0f,
3490
+ });
3418
3491
  }
3419
3492
 
3420
3493
  static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
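GET_ROWS now passes the full shapes and strides of src0, src1, and dst instead of just two element counts, with each byte stride divided by the type size so the shader indexes in elements (blocks for quantized data); the dispatch earlier in this diff covers { ne00, ne10, ne11*ne12 }, i.e. one invocation per destination element. The exact definition of vk_op_binary_push_constants is not shown in this diff; the following is only a layout consistent with the 28-value initializer above, with field names assumed:

```cpp
#include <cstdint>

// Hypothetical reconstruction: element count, then ne/nb (in elements) for src0, src1
// and dst, a destination offset, and two float parameters that GET_ROWS leaves at zero.
struct vk_op_binary_push_constants_sketch {
    uint32_t ne;
    uint32_t ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03;
    uint32_t ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13;
    uint32_t ne20, ne21, ne22, ne23, nb20, nb21, nb22, nb23;
    uint32_t d_offset;
    float param1, param2;
};
```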
@@ -3570,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
  if (is_neox) {
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float inv_ndims = -1.0f / n_dims;
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
  } else {
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
  }
  }

@@ -3581,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
  ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
  }

- static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
- // If backend is CPU, data from src0 has to be copied off the device
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
- vk_buffer d_D = extra_src0->buffer_gpu.lock();
- ggml_vk_sync_buffers(subctx);
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
- }
- }
-
  #ifdef GGML_VULKAN_RUN_TESTS
  static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
  if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3613,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
  val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
  } else if (type == GGML_TYPE_F16) {
  val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+ } else {
+ GGML_ASSERT(false);
  }
  fprintf(stderr, "% 7.2f ", val);
  } else {
@@ -3914,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
  val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
  } else if (tensor->type == GGML_TYPE_F16) {
  val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+ } else {
+ GGML_ASSERT(false);
  }
  fprintf(stderr, "% 7.2f ", val);
  } else {
@@ -4329,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

  std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;

- if (avg_err > 0.1 || std::isnan(avg_err)) {
+ if (avg_err > 0.01 || std::isnan(avg_err)) {
  std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
  std::cerr << "Actual result: " << std::endl << std::endl;
  ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4379,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
  return extra;
  }

- static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
- return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
- }
-
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
  #ifdef GGML_VULKAN_DEBUG
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
  #endif
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
- || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
  return;
  }

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
- if (extra == nullptr) {
- // Workaround for CPU backend BLAS matmul calls
- extra = ggml_vk_tensor_create_extra(node);
- }

  ggml_tensor * src0 = node->src[0];
  ggml_tensor * src1 = node->src[1];
@@ -4419,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  const int64_t ne22 = node->ne[2];
  const int64_t ne23 = node->ne[3];

- const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
+ const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+ const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+ const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+ const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+ bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+ const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+ const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

  int split_k;
  if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
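The hunk above (together with the sizing hunk that follows) replaces the old qx/qy staging buffers with a simpler rule: the fp16/fp32 conversion slabs prealloc_x and prealloc_y are only sized when a source actually has to be dequantized or made contiguous before the matmul; otherwise the tensor's own device buffer is used directly and nothing is preallocated. A self-contained sketch of that rule (hypothetical helper, not the actual code, which also batches over ne02*ne03 / ne12*ne13):

```cpp
#include <cstdint>

// Round `size` up to a multiple of `align`, mirroring what ggml_vk_align_size does.
static uint64_t align_up(uint64_t size, uint64_t align) {
    return ((size + align - 1) / align) * align;
}

// Size of the conversion buffer for one source operand: zero when no dequant/contiguous
// copy is needed, otherwise one aligned matrix-sized slab per batch element.
static uint64_t conversion_buffer_size(bool needs_dequant, uint64_t elem_size,
                                       uint64_t ne_per_matrix, uint64_t n_matrices, uint64_t align) {
    return needs_dequant ? align_up(elem_size * ne_per_matrix, align) * n_matrices : 0;
}
```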
@@ -4431,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  const uint32_t y_ne = ne10 * ne11;
  const uint32_t d_ne = ne20 * ne21;

- const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
- const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
- const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
- const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+ const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+ const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
  uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
  const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

@@ -4477,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
  break;
  case GGML_OP_MUL_MAT:
  case GGML_OP_MUL_MAT_ID:
- if (ctx->prealloc_size_qx < qx_sz) {
- ctx->prealloc_size_qx = qx_sz;
- }
- if (ctx->prealloc_size_qy < qy_sz) {
- ctx->prealloc_size_qy = qy_sz;
- }
  if (ctx->prealloc_size_x < x_sz) {
  ctx->prealloc_size_x = x_sz;
  }
@@ -4506,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  return;
  }
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
+ std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
  #endif
  #if defined(GGML_VULKAN_RUN_TESTS)
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4569,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);

+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
  std::cerr << std::endl;

  const std::vector<size_t> vals {
@@ -4608,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  GGML_ASSERT(false);
  #endif

- if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
- // Resize buffer
- if (ctx->prealloc_qx != nullptr) {
- ggml_vk_destroy_buffer(ctx->prealloc_qx);
- }
- ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
- }
- if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
- // Resize buffer
- if (ctx->prealloc_qy != nullptr) {
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
- }
- ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
- }
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
  // Resize buffer
  if (ctx->prealloc_x != nullptr) {
@@ -4655,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
- || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
  return;
  }

@@ -4687,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  }
  break;
  case GGML_OP_REPEAT:
- // case GGML_OP_GET_ROWS:
+ case GGML_OP_GET_ROWS:
  case GGML_OP_ADD:
  case GGML_OP_MUL:
  case GGML_OP_SCALE:
@@ -4711,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_ARGSORT:
  break;
  default:
- if (any_on_device) {
- std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
- GGML_ASSERT(false);
- }
+ std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+ GGML_ASSERT(false);
  return;
  }

@@ -4763,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
  case GGML_OP_NONE:
- ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
  break;
  case GGML_OP_NORM:
  ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4831,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  }

  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
- const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
+ if (ctx->disable) {
  return false;
  }

@@ -4878,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
  break;
  case GGML_OP_MUL_MAT:
  case GGML_OP_MUL_MAT_ID:
- if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
- return false;
- }
-
  extra = (ggml_tensor_extra_gpu *) tensor->extra;

  break;
@@ -4995,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
  #endif
  ggml_vk_graph_cleanup(ctx);

- ggml_vk_destroy_buffer(ctx->prealloc_qx);
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
  ggml_vk_destroy_buffer(ctx->prealloc_x);
  ggml_vk_destroy_buffer(ctx->prealloc_y);
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5007,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
  ggml_vk_destroy_buffer(buffer);
  }

- ctx->prealloc_size_qx = 0;
- ctx->prealloc_size_qy = 0;
  ctx->prealloc_size_x = 0;
  ctx->prealloc_size_y = 0;
  ctx->prealloc_size_split_k = 0;
@@ -5039,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
  snprintf(description, description_size, "%s", props.deviceName.data());
  }

- // CPU assist interface
-
- void ggml_vk_init_cpu_assist() {
- ggml_vk_instance_init();
-
- std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
- for (int i = 0; i < ggml_vk_get_device_count(); i++) {
- ggml_vk_print_gpu_info(i);
- }
- // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
- ggml_backend_vk_init(0);
- }
-
- void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized) {
- return;
- }
-
- ggml_vk_preallocate_buffers_graph(ctx, node);
- }
-
- void ggml_vk_preallocate_buffers_cpu_assist() {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized) {
- return;
- }
-
- ggml_vk_preallocate_buffers(ctx);
- }
-
- void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized) {
- return;
- }
-
- ggml_vk_build_graph(ctx, node, last_node);
- }
-
- bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized) {
- return false;
- }
-
- return ggml_vk_compute_forward(ctx, params, tensor);
- }
-
- void ggml_vk_graph_cleanup_cpu_assist() {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized) {
- return;
- }
-
- ggml_vk_graph_cleanup(ctx);
- }
-
- void ggml_vk_free_cpu_assist() {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
- return;
- }
-
- ggml_backend_vk_free(vk_instance.backends[0]);
- }
-
  // backend interface

  #define UNUSED GGML_UNUSED
@@ -5324,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
  /* .is_host = */ NULL,
  };

- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) {
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl;
+ std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
  #endif

- GGML_ASSERT(idx < vk_instance.device_indices.size());
+ GGML_ASSERT(dev_num < vk_instance.device_indices.size());

- ggml_backend_vk_init(idx);
+ ggml_backend_vk_init(dev_num);

- return &vk_instance.buffer_types[idx];
+ return &vk_instance.buffer_types[dev_num];
  }

  // host buffer type
@@ -5502,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

- ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src));
+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
  return true;
  }

@@ -5536,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
  }

  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+ #endif
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5560,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];

- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }

@@ -5596,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  }
  break;
  case GGML_OP_MUL_MAT:
- case GGML_OP_MUL_MAT_ID:
+ // case GGML_OP_MUL_MAT_ID:
  {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ break;
+ default:
+ return false;
+ }
  struct ggml_tensor * a;
  struct ggml_tensor * b;
  if (op->op == GGML_OP_MUL_MAT) {
@@ -5612,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  }
  return true;
  } break;
- // case GGML_OP_GET_ROWS:
- // {
- // switch (op->src[0]->type) {
- // case GGML_TYPE_F16:
- // case GGML_TYPE_F32:
- // case GGML_TYPE_Q4_0:
- // case GGML_TYPE_Q4_1:
- // case GGML_TYPE_Q5_0:
- // case GGML_TYPE_Q5_1:
- // case GGML_TYPE_Q8_0:
- // return true;
- // default:
- // return false;
- // }
- // } break;
+ case GGML_OP_GET_ROWS:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
+ } break;
  case GGML_OP_CPY:
+ case GGML_OP_DUP:
  {
  ggml_type src0_type = op->src[0]->type;
- ggml_type src1_type = op->src[1]->type;
+ ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return true;
  }
@@ -5642,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  }
  return false;
  } break;
- case GGML_OP_DUP:
  // case GGML_OP_REPEAT:
  // {
  // ggml_type src0_type = op->src[0]->type;
@@ -5679,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
  UNUSED(backend);
  }

+ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ const ggml_tensor * dst = op;
+
+ const int min_batch_size = 32;
+
+ if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+ return true;
+ }
+
+ return false;
+
+ UNUSED(backend);
+ }
+
  // TODO: enable async and synchronize
  static ggml_backend_i ggml_backend_vk_interface = {
  /* .get_name = */ ggml_backend_vk_name,
@@ -5693,6 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
  /* .supports_op = */ ggml_backend_vk_supports_op,
+ /* .offload_op = */ ggml_backend_vk_offload_op,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
  /* .event_record = */ NULL,
@@ -5705,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
  return &guid;
  }

- GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
- if (vk_instance.initialized[idx]) {
- return vk_instance.backends[idx];
+ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+ if (vk_instance.initialized[dev_num]) {
+ return vk_instance.backends[dev_num];
  }
  #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl;
+ std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
  #endif

- ggml_backend_vk_context * ctx = &vk_instance.contexts[idx];
- ggml_vk_init(ctx, idx);
- ctx->name = GGML_VK_NAME + std::to_string(idx);
- vk_instance.buffer_types[idx] = {
+ ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+ ggml_vk_init(ctx, dev_num);
+ ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+ vk_instance.buffer_types[dev_num] = {
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
  /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
  };
- vk_instance.initialized[idx] = true;
+ vk_instance.initialized[dev_num] = true;

  ggml_backend_t vk_backend = new ggml_backend {
  /* .guid = */ ggml_backend_vk_guid(),
@@ -5728,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
  /* .context = */ &vk_instance.contexts[ctx->idx],
  };

- vk_instance.backends[idx] = vk_backend;
+ vk_instance.backends[dev_num] = vk_backend;

  return vk_backend;
  }
@@ -5772,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
  extern "C" GGML_CALL int ggml_backend_vk_reg_devices();

  GGML_CALL int ggml_backend_vk_reg_devices() {
- for (auto idx : vk_instance.device_indices) {
+ ggml_vk_instance_init();
+
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
  char name[128];
- snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
- ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx);
+ snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+ ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
  }
  return vk_instance.device_indices.size();
  }
@@ -5859,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
  } else if (tensor->type == GGML_TYPE_F16) {
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+ } else {
+ GGML_ASSERT(false);
  }
  fprintf(stderr, "% 7.2f ", val);
  } else {
@@ -5953,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }

+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
+ #endif
+
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];
  ggml_tensor * src2 = tensor->src[2];
@@ -6212,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
  } else if (tensor->op == GGML_OP_TRANSPOSE) {
  tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_GET_ROWS) {
+ tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
  } else {
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
  GGML_ASSERT(false);
@@ -6262,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }

+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
+ #endif
+
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];

@@ -6405,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  free(tensor_data);
  }
  }
-
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
- ggml_vk_check_results_0(ctx, params, tensor);
- }
  #endif