llama_cpp 0.14.2 → 0.14.4

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -9,7 +9,6 @@
9
9
  #include <algorithm>
10
10
  #include <cmath>
11
11
  #include <iostream>
12
- #include <iomanip>
13
12
  #include <limits>
14
13
  #include <tuple>
15
14
  #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
340
339
  size_t semaphore_idx, event_idx;
341
340
  ggml_vk_garbage_collector gc;
342
341
  std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
343
- size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
344
- vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
342
+ size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
343
+ vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
345
344
  vk::Fence fence;
346
345
  vk_buffer staging;
347
346
  size_t staging_size;
@@ -710,6 +709,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
710
709
  }
711
710
  }
712
711
 
712
+ // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
713
+ // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
714
+ if (compute_index >= 0) {
715
+ return compute_index;
716
+ }
717
+
713
718
  std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
714
719
 
715
720
  for(auto &q_family : queue_family_props) {
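Note: the hunk above changes the ggml Vulkan backend's queue-family selection. If the earlier search found no queue family explicitly reporting VK_QUEUE_TRANSFER_BIT with the requested properties, the function now falls back to a compute-capable family (the compute_index recorded earlier in the function) instead of erroring out, relying on the Vulkan guarantee quoted in the new comment. A minimal standalone sketch of that policy, with illustrative names rather than the backend's actual API:

```cpp
#include <cstdint>
#include <vector>
#include <vulkan/vulkan.hpp>

// Pick a queue family for transfer work: prefer a dedicated transfer-only
// family, otherwise fall back to a compute-capable one, which the Vulkan
// spec guarantees can also execute transfer commands.
static int32_t pick_transfer_queue_family(const std::vector<vk::QueueFamilyProperties> & families) {
    int32_t compute_index = -1;
    for (int32_t i = 0; i < (int32_t) families.size(); i++) {
        const vk::QueueFlags flags = families[i].queueFlags;
        const bool transfer = bool(flags & vk::QueueFlagBits::eTransfer);
        const bool compute  = bool(flags & vk::QueueFlagBits::eCompute);
        const bool graphics = bool(flags & vk::QueueFlagBits::eGraphics);
        if (transfer && !compute && !graphics) {
            return i;                   // dedicated transfer queue family
        }
        if (compute && compute_index < 0) {
            compute_index = i;          // remember the first compute-capable family
        }
    }
    return compute_index;               // -1 only if no compute family exists at all
}
```

The sketch still prefers a dedicated transfer-only family when one exists, since that lets transfers overlap with compute work; the fallback only kicks in when no such family is reported.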
@@ -803,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
803
808
 
804
809
  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
805
810
  #ifdef GGML_VULKAN_DEBUG
806
- std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
811
+ std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
807
812
  #endif
808
813
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
809
814
 
@@ -992,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
992
997
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
993
998
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
994
999
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
1000
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
1001
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
1002
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
1003
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
1004
+ ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
995
1005
 
996
1006
  if (device->fp16) {
997
1007
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1049,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1049
1059
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1050
1060
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1051
1061
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1062
+
1063
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1064
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1065
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1066
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1067
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1068
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1069
+
1070
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1071
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1072
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1073
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1074
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1075
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1076
+
1077
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1078
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1079
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1080
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1081
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1082
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1083
+
1084
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1085
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1086
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1087
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1088
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1089
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1090
+
1091
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1092
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1093
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1094
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1095
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1096
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1052
1097
  } else {
1053
1098
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
1054
1099
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1105,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1105
1150
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1106
1151
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1107
1152
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1153
+
1154
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1155
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1156
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1157
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1158
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1159
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1160
+
1161
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1162
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1163
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1164
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1165
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1166
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1167
+
1168
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1169
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1170
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1171
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1172
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1173
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1174
+
1175
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1176
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1177
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1178
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1179
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1180
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1181
+
1182
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1183
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1184
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1185
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
1186
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
1187
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
1108
1188
  }
1109
1189
 
1110
1190
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
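Note: the three hunks above allocate matmul pipeline slots for the k-quants (Q2_K through Q6_K) and register their large/medium/small and aligned shader variants in both the fp16 and fp32 shader builds, mirroring what was already done for Q4_0 through Q8_0. Each quantized type therefore ends up with six pipelines. A hedged selection sketch, assuming only the small-matrix threshold visible later in this diff (m <= 32 || n <= 32); the bundle struct and helper are illustrative, and the real ggml_vk_guess_matmul_pipeline presumably also has a medium tier (the _m pipelines) plus the Apple/Intel vendor overrides shown in the vendor switch below:

```cpp
#include <cstdint>

// Illustrative bundle mirroring the six pipeline variants registered per
// quantized type above.
struct pipeline_handle { /* opaque shader pipeline */ };

struct matmul_pipeline_bundle {
    pipeline_handle l, m, s;        // large / medium / small workgroup tiles
    pipeline_handle a_l, a_m, a_s;  // their "aligned" counterparts
};

static const pipeline_handle & pick_variant(const matmul_pipeline_bundle & b,
                                            uint64_t m, uint64_t n, bool aligned) {
    if (m <= 32 || n <= 32) {
        return aligned ? b.a_s : b.s;
    }
    return aligned ? b.a_l : b.l;   // medium tier omitted in this sketch
}
```

The aligned variants are only usable when the operand sizes meet the alignment the pipelines were registered with (the l_align/m_align/s_align arguments above).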
@@ -1133,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1133
1213
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1134
1214
 
1135
1215
  // get_rows
1136
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1137
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1138
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1139
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1140
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1141
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1142
-
1143
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1144
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1145
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1146
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1147
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1148
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
1216
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1217
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1218
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1219
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1220
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1221
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1222
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1223
+
1224
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1225
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1226
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1227
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1228
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1229
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1230
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1149
1231
 
1150
1232
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
1151
1233
 
@@ -1335,7 +1417,33 @@ void ggml_vk_instance_init() {
1335
1417
  vk_instance.device_indices.push_back(tmp);
1336
1418
  }
1337
1419
  } else {
1338
- vk_instance.device_indices.push_back(0);
1420
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
1421
+
1422
+ // Make sure at least one device exists
1423
+ if (devices.empty()) {
1424
+ std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
1425
+ GGML_ASSERT(false);
1426
+ }
1427
+
1428
+ // Default to using all dedicated GPUs
1429
+ for (size_t i = 0; i < devices.size(); i++) {
1430
+ vk::PhysicalDeviceProperties props = devices[i].getProperties();
1431
+
1432
+ if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
1433
+ vk_instance.device_indices.push_back(i);
1434
+ }
1435
+ }
1436
+
1437
+ // If no dedicated GPUs found, fall back to GPU 0
1438
+ if (vk_instance.device_indices.empty()) {
1439
+ vk_instance.device_indices.push_back(0);
1440
+ }
1441
+ }
1442
+
1443
+ std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
1444
+
1445
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
1446
+ ggml_vk_print_gpu_info(i);
1339
1447
  }
1340
1448
 
1341
1449
  vk_instance_initialized = true;
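Note: device selection in ggml_vk_instance_init changes here. When no explicit device list is given, the backend no longer defaults to physical device 0; it enumerates all devices, uses every discrete GPU it finds, falls back to device 0 only when none is discrete, and then prints the selected devices. This avoids silently running on an integrated GPU when a discrete one is available. A self-contained Vulkan-Hpp sketch of the same policy (instance setup omitted, names illustrative):

```cpp
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>
#include <vulkan/vulkan.hpp>

// Default device-selection policy sketched from the hunk above:
// use every discrete GPU, otherwise fall back to physical device 0.
static std::vector<size_t> default_device_indices(const vk::Instance & instance) {
    const std::vector<vk::PhysicalDevice> devices = instance.enumeratePhysicalDevices();
    if (devices.empty()) {
        throw std::runtime_error("no Vulkan devices found");
    }

    std::vector<size_t> indices;
    for (size_t i = 0; i < devices.size(); i++) {
        if (devices[i].getProperties().deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
            indices.push_back(i);   // take every dedicated GPU
        }
    }
    if (indices.empty()) {
        indices.push_back(0);       // integrated-only system: keep the old default
    }

    std::cerr << "selected " << indices.size() << " Vulkan device(s)" << std::endl;
    return indices;
}
```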
@@ -1561,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
1561
1669
 
1562
1670
  switch (src0_type) {
1563
1671
  case GGML_TYPE_Q4_0:
1672
+ case GGML_TYPE_Q4_1:
1673
+ case GGML_TYPE_Q5_0:
1674
+ case GGML_TYPE_Q5_1:
1675
+ case GGML_TYPE_Q8_0:
1676
+ case GGML_TYPE_Q2_K:
1677
+ case GGML_TYPE_Q3_K:
1678
+ case GGML_TYPE_Q4_K:
1679
+ case GGML_TYPE_Q5_K:
1680
+ case GGML_TYPE_Q6_K:
1564
1681
  break;
1565
1682
  default:
1566
1683
  return nullptr;
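Note: ggml_vk_get_mul_mat_mat_pipeline previously accepted only GGML_TYPE_Q4_0 in this switch; it now falls through for all legacy quants and k-quants, so these types can use the direct quantized matrix-matrix pipelines registered earlier instead of returning nullptr. Read as a predicate, the gate is simply (helper name hypothetical, type list taken from the hunk):

```cpp
#include "ggml.h"

// Hypothetical predicate equivalent to the switch fallthrough above: true if
// a direct dequant matrix-matrix pipeline exists for this quantized type.
static bool vk_has_mmq_pipeline(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
            return false;
    }
}
```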
@@ -2028,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
2028
2145
  ggml_vk_submit(subctx, ctx->fence);
2029
2146
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
2030
2147
  ctx->device->device.resetFences({ ctx->fence });
2031
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
2032
2148
  }
2033
2149
  }
2034
2150
 
@@ -2125,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
2125
2241
  for (auto& cpy : subctx->out_memcpys) {
2126
2242
  memcpy(cpy.dst, cpy.src, cpy.n);
2127
2243
  }
2128
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
2129
2244
  }
2130
2245
  }
2131
2246
 
@@ -2292,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
2292
2407
  return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
2293
2408
  case VK_VENDOR_ID_INTEL:
2294
2409
  return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
2410
+ default:
2411
+ break;
2295
2412
  }
2296
2413
 
2297
2414
  if (m <= 32 || n <= 32) {
@@ -2417,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2417
2534
  src1_uma = d_Qy != nullptr;
2418
2535
  }
2419
2536
 
2420
- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2421
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2422
-
2423
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2424
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
2537
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
2538
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
2425
2539
 
2426
2540
  const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
2427
2541
 
@@ -2463,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2463
2577
  uint64_t x_buf_offset = 0;
2464
2578
  vk_buffer d_Y;
2465
2579
  uint64_t y_buf_offset = 0;
2466
- if (load_x) {
2467
- d_Qx = ctx->prealloc_qx;
2468
- } else if (!src0_uma) {
2580
+ if (!src0_uma) {
2469
2581
  d_Qx = extra_src0->buffer_gpu.lock();
2470
2582
  qx_buf_offset = extra_src0->offset;
2471
2583
  GGML_ASSERT(d_Qx != nullptr);
2472
2584
  }
2473
- if (load_y) {
2474
- d_Qy = ctx->prealloc_qy;
2475
- } else if (!src1_uma) {
2585
+ if (!src1_uma) {
2476
2586
  d_Qy = extra_src1->buffer_gpu.lock();
2477
2587
  qy_buf_offset = extra_src1->offset;
2478
2588
  GGML_ASSERT(d_Qy != nullptr);
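Note: this hunk, together with the struct change near the top of the diff that drops prealloc_qx/prealloc_qy and the load_x/load_y removals below, eliminates the host-to-device staging path from the matmul routines: an operand is now expected to be reachable through unified memory or to already have a device buffer attached to its extra, which is simply locked, rather than being copied into preallocated qx/qy buffers with ggml_vk_h2d_tensor_2d. The mat-vec variant further below also replaces an `else if(!src1_uma)` guarding the src0 branch with a plain `if(!src0_uma)`. An abstract, self-contained sketch of the resulting operand resolution, using stand-in types rather than the backend's:

```cpp
#include <cassert>
#include <cstdint>
#include <memory>

// Abstract sketch of the new operand resolution: there is no host->device
// staging copy any more -- an operand is either visible through unified
// memory (UMA) or must already own a device buffer.
struct device_buffer {};                        // stand-in for the backend's vk_buffer_struct

struct tensor_gpu_extra {                       // stand-in for ggml_tensor_extra_gpu
    std::weak_ptr<device_buffer> buffer_gpu;    // set when the tensor lives in a backend buffer
    uint64_t offset = 0;
};

struct operand_ref {
    std::shared_ptr<device_buffer> buffer;      // stays null for UMA-resident operands
    uint64_t offset = 0;
};

static operand_ref resolve_operand(const tensor_gpu_extra & extra, bool uma) {
    operand_ref ref;
    if (!uma) {
        ref.buffer = extra.buffer_gpu.lock();   // mirrors extra_srcN->buffer_gpu.lock()
        ref.offset = extra.offset;
        assert(ref.buffer != nullptr);          // operand must already be on-device
    }
    return ref;
}
```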
@@ -2524,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2524
2634
 
2525
2635
  if (x_non_contig) {
2526
2636
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
2527
- } else if (load_x || qx_needs_dequant) {
2528
- if (load_x) {
2529
- // copy data to device
2530
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
2531
- ctx->staging_offset = qx_sz * ne02 * ne03;
2532
- }
2533
-
2534
- if (qx_needs_dequant) {
2535
- const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
2536
- ggml_vk_sync_buffers(subctx);
2537
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
2538
- }
2637
+ } else if (qx_needs_dequant) {
2638
+ const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
2639
+ ggml_vk_sync_buffers(subctx);
2640
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
2539
2641
  }
2540
2642
  if (y_non_contig) {
2541
2643
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
2542
- } else if (load_y) {
2543
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
2544
2644
  }
2545
2645
 
2546
2646
  uint32_t stride_batch_x = ne00*ne01;
2547
2647
  uint32_t stride_batch_y = ne10*ne11;
2548
2648
 
2549
- if (!ggml_vk_dim01_contiguous(src0) && !load_x && !qx_needs_dequant) {
2649
+ if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
2550
2650
  stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
2551
2651
  }
2552
2652
 
2553
- if (!ggml_vk_dim01_contiguous(src1) && !load_y && !qy_needs_dequant) {
2653
+ if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
2554
2654
  stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
2555
2655
  }
2556
2656
 
@@ -2610,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2610
2710
  src1_uma = d_Qy != nullptr;
2611
2711
  }
2612
2712
 
2613
- const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
2614
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2615
-
2616
- const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
2617
- const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
2713
+ const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
2714
+ const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
2618
2715
 
2619
2716
  const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
2620
2717
 
@@ -2638,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2638
2735
  uint64_t x_buf_offset = 0;
2639
2736
  vk_buffer d_Y;
2640
2737
  uint64_t y_buf_offset = 0;
2641
- if (load_x) {
2642
- d_Qx = ctx->prealloc_qx;
2643
- } else if(!src1_uma) {
2738
+ if(!src0_uma) {
2644
2739
  d_Qx = extra_src0->buffer_gpu.lock();
2645
2740
  qx_buf_offset = extra_src0->offset;
2646
2741
  GGML_ASSERT(d_Qx != nullptr);
2647
2742
  }
2648
- if (load_y) {
2649
- d_Qy = ctx->prealloc_qy;
2650
- } else if(!src1_uma) {
2743
+ if(!src1_uma) {
2651
2744
  d_Qy = extra_src1->buffer_gpu.lock();
2652
2745
  qy_buf_offset = extra_src1->offset;
2653
2746
  GGML_ASSERT(d_Qy != nullptr);
@@ -2694,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
2694
2787
  if (x_non_contig) {
2695
2788
  GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
2696
2789
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
2697
- } else if (load_x) {
2698
- // copy data to device
2699
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
2700
2790
  }
2701
2791
  if (y_non_contig) {
2702
2792
  GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
2703
2793
  ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
2704
- } else if (load_y) {
2705
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
2706
2794
  }
2707
2795
 
2708
2796
  for (uint64_t i13 = 0; i13 < ne13; i13++) {
@@ -2783,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2783
2871
  src1_uma = d_Qy != nullptr;
2784
2872
  }
2785
2873
 
2786
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2787
-
2788
2874
  const uint64_t x_ne = ne00 * ne01 * ne02;
2789
2875
  const uint64_t y_ne = ne10 * ne11 * ne12;
2790
2876
  const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2799,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2799
2885
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
2800
2886
  const uint64_t qx_buf_offset = extra_src0->offset;
2801
2887
  GGML_ASSERT(d_Qx != nullptr);
2802
- if (load_y) {
2803
- d_Qy = ctx->prealloc_qy;
2804
- } else if (!src1_uma) {
2888
+ if (!src1_uma) {
2805
2889
  d_Qy = extra_src1->buffer_gpu.lock();
2806
2890
  qy_buf_offset = extra_src1->offset;
2807
2891
  GGML_ASSERT(d_Qx != nullptr);
@@ -2816,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
2816
2900
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
2817
2901
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
2818
2902
 
2819
- if (load_y) {
2820
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
2821
- }
2822
-
2823
2903
  // compute
2824
2904
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
2825
2905
  ggml_vk_sync_buffers(subctx);
@@ -2875,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2875
2955
  src1_uma = d_Qy != nullptr;
2876
2956
  }
2877
2957
 
2878
- const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
2879
-
2880
2958
  const uint64_t d_ne = ne01 * ne11 * ne12;
2881
2959
 
2882
2960
  const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2892,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2892
2970
  vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
2893
2971
  const uint64_t qx_buf_offset = extra_src0->offset;
2894
2972
  GGML_ASSERT(d_Qx != nullptr);
2895
- if (load_y) {
2896
- d_Qy = ctx->prealloc_qy;
2897
- } else {
2973
+ if (!src1_uma) {
2898
2974
  d_Qy = extra_src1->buffer_gpu.lock();
2899
2975
  qy_buf_offset = extra_src1->offset;
2900
2976
  GGML_ASSERT(d_Qx != nullptr);
@@ -2909,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
2909
2985
  const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
2910
2986
  const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
2911
2987
 
2912
- if (load_y) {
2913
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
2914
- }
2915
-
2916
2988
  // compute
2917
2989
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
2918
2990
  ggml_vk_sync_buffers(subctx);
@@ -3168,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3168
3240
  }
3169
3241
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3170
3242
  #endif
3171
- GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
3243
+ GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
3172
3244
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
3173
3245
  GGML_ASSERT(dst->extra != nullptr);
3174
3246
  const uint64_t ne00 = src0->ne[0];
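Note: the relaxed assertion lets GGML_OP_GET_ROWS pass through ggml_vk_op_f32 with a quantized src0, matching the get_rows_q* pipelines registered earlier; the selected rows are dequantized by the shader as they are gathered. For reference, a host-side f32 sketch of the operation being offloaded (the shaders additionally handle f16 and the quantized block layouts):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Reference semantics of GET_ROWS for a 2-D f32 source: dst row i is a copy
// of src0 row indices[i]. The Vulkan version performs the same gather and,
// for quantized sources, dequantizes each row on the fly.
static std::vector<float> get_rows_ref(const std::vector<float> & src0,       // ne01 rows of ne00 floats, row-major
                                       int64_t ne00,
                                       const std::vector<int32_t> & indices)  // row ids, ggml's I32 src1
{
    std::vector<float> dst(indices.size() * ne00);
    for (size_t i = 0; i < indices.size(); i++) {
        const float * src_row = src0.data() + (size_t) indices[i] * (size_t) ne00;
        std::copy(src_row, src_row + ne00, dst.begin() + i * ne00);
    }
    return dst;
}
```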
@@ -3236,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3236
3308
  }
3237
3309
  }
3238
3310
 
3239
- const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
3240
- const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
3241
- const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
3242
-
3243
- uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3311
+ uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3244
3312
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
3245
3313
  uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
3246
3314
  uint64_t d_sz = ggml_type_size(dst->type) * ne0;
@@ -3255,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3255
3323
  GGML_ASSERT(d_D != nullptr);
3256
3324
  uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
3257
3325
  GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
3258
- if (transfer_src0) {
3259
- d_X = ctx->prealloc_qx;
3260
- } else if(!src0_uma) {
3326
+ if(!src0_uma) {
3261
3327
  d_X = extra_src0->buffer_gpu.lock();
3262
3328
  x_buf_offset = extra_src0->offset;
3263
3329
  GGML_ASSERT(d_X != nullptr);
3264
3330
  }
3265
- if (transfer_src1) {
3266
- d_Y = ctx->prealloc_qy;
3267
- } else if (use_src1 && !src1_uma) {
3331
+ if (use_src1 && !src1_uma) {
3268
3332
  d_Y = extra_src1->buffer_gpu.lock();
3269
3333
  y_buf_offset = extra_src1->offset;
3270
3334
  GGML_ASSERT(d_Y != nullptr);
3271
3335
  }
3272
3336
 
3273
- GGML_ASSERT(!transfer_src2);
3274
3337
  if (use_src2 && !src2_uma) {
3275
3338
  d_Z = extra_src2->buffer_gpu.lock();
3276
3339
  z_buf_offset = extra_src2->offset;
3277
3340
  GGML_ASSERT(d_Z != nullptr);
3278
3341
  }
3279
3342
 
3280
- if (op == GGML_OP_CPY) {
3281
- GGML_ASSERT(!transfer_src0);
3282
- GGML_ASSERT(!transfer_src1);
3343
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
3283
3344
  x_sz = ggml_nbytes(src0);
3345
+ y_sz = use_src1 ? ggml_nbytes(src1) : 0;
3284
3346
  d_sz = ggml_nbytes(dst);
3285
3347
 
3286
- if (extra_src0->offset + x_sz >= d_X->size) {
3348
+ if (x_buf_offset + x_sz >= d_X->size) {
3287
3349
  x_sz = VK_WHOLE_SIZE;
3288
3350
  }
3289
- if (extra->offset + d_sz >= d_D->size) {
3351
+ if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
3352
+ y_sz = VK_WHOLE_SIZE;
3353
+ }
3354
+ if (d_buf_offset + d_sz >= d_D->size) {
3290
3355
  d_sz = VK_WHOLE_SIZE;
3291
3356
  }
3292
3357
  }
3293
3358
 
3294
3359
  std::array<uint32_t, 3> elements;
3295
3360
 
3296
- // copy src0 to device
3297
- if (transfer_src0) {
3298
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
3299
- ctx->staging_offset = x_sz * ne02 * ne03;
3300
- }
3301
- if (transfer_src1) {
3302
- ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
3303
- }
3304
-
3305
3361
  // Single call if dimension 2 is contiguous
3306
- if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
3362
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
3307
3363
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
3308
3364
 
3309
3365
  switch (dst->op) {
@@ -3316,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3316
3372
  case GGML_OP_ROPE:
3317
3373
  elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
3318
3374
  break;
3375
+ case GGML_OP_GET_ROWS:
3376
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
3377
+ break;
3319
3378
  default:
3320
3379
  elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
3321
3380
  break;
3322
3381
  }
3323
3382
 
3324
- if (op != GGML_OP_CPY) {
3383
+ if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
3325
3384
  if (x_sz != VK_WHOLE_SIZE) {
3326
3385
  x_sz *= ne02 * ne03;
3327
3386
  }
3328
- if (y_sz != VK_WHOLE_SIZE) {
3387
+ if (use_src1 && y_sz != VK_WHOLE_SIZE) {
3329
3388
  y_sz *= ne12 * ne13;
3330
3389
  }
3331
3390
  if (d_sz != VK_WHOLE_SIZE) {
@@ -3380,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3380
3439
  case GGML_OP_ROPE:
3381
3440
  elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
3382
3441
  break;
3442
+ case GGML_OP_GET_ROWS:
3443
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
3444
+ break;
3383
3445
  default:
3384
3446
  elements = { (uint32_t)ne0, 1, 1 };
3385
3447
  break;
@@ -3414,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
3414
3476
  }
3415
3477
 
3416
3478
  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3417
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
3479
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
3480
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
3481
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
3482
+
3483
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
3484
+ (uint32_t)ggml_nelements(src0),
3485
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
3486
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
3487
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
3488
+ 0,
3489
+ 0.0f, 0.0f,
3490
+ });
3418
3491
  }
3419
3492
 
3420
3493
  static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
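Note: ggml_vk_get_rows now fills vk_op_binary_push_constants with the full shapes and per-dimension strides (expressed in elements, i.e. nb[i] divided by the type size) of src0, src1 and dst, instead of the old two-value vk_op_push_constants; this is what lets the shader index non-contiguous tensors and broadcast over the outer dimensions. The struct itself is not visible in this diff; a hypothetical layout consistent with the 28 values pushed above, with field names guessed from the call site:

```cpp
#include <cstdint>

// Hypothetical layout matching the 28 32-bit values pushed by ggml_vk_get_rows
// above; field names are guesses from the call site, the real struct is
// defined elsewhere in the backend.
struct vk_op_binary_push_constants_sketch {
    uint32_t ne;                                              // ggml_nelements(src0)
    uint32_t ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03;  // src0 shape + strides (in elements)
    uint32_t ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13;  // src1 shape + strides (in elements)
    uint32_t ne20, ne21, ne22, ne23, nb20, nb21, nb22, nb23;  // dst shape + strides (in elements)
    uint32_t misc;                                            // 0 in this call
    float    param1, param2;                                  // 0.0f here
};
static_assert(sizeof(vk_op_binary_push_constants_sketch) == 28 * sizeof(uint32_t),
              "28 tightly packed 32-bit values");
```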
@@ -3570,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
3570
3643
  if (is_neox) {
3571
3644
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
3572
3645
  const float inv_ndims = -1.0f / n_dims;
3573
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
3646
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
3574
3647
  } else {
3575
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
3648
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
3576
3649
  }
3577
3650
  }
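Note: the extra braces around the correction dims in both rope calls suggest that vk_op_rope_push_constants (and its NeoX variant) now stores them as a four-element float array rather than four separate scalars. A hedged sketch of the implied layout, with field names inferred from the call sites only:

```cpp
#include <cstdint>

// Hedged sketch of the layout implied by the braced initializer above; the
// field names are inferred from the call sites and may not match the real
// struct exactly.
struct vk_op_rope_push_constants_sketch {
    uint32_t ncols;          // src0->ne[0]
    float    freq_scale;
    uint32_t p_delta_rows;   // src0->ne[1]
    float    freq_base;
    float    ext_factor;
    float    attn_factor;
    float    corr_dims[4];   // {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
};
```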
3578
3651
 
@@ -3581,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
3581
3654
  ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
3582
3655
  }
3583
3656
 
3584
- static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
3585
- // If backend is CPU, data from src0 has to be copied off the device
3586
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3587
- ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3588
- vk_buffer d_D = extra_src0->buffer_gpu.lock();
3589
- ggml_vk_sync_buffers(subctx);
3590
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
3591
- }
3592
- }
3593
-
3594
3657
  #ifdef GGML_VULKAN_RUN_TESTS
3595
3658
  static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
3596
3659
  if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -3613,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
3613
3676
  val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
3614
3677
  } else if (type == GGML_TYPE_F16) {
3615
3678
  val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
3679
+ } else {
3680
+ GGML_ASSERT(false);
3616
3681
  }
3617
3682
  fprintf(stderr, "% 7.2f ", val);
3618
3683
  } else {
@@ -3914,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
3914
3979
  val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
3915
3980
  } else if (tensor->type == GGML_TYPE_F16) {
3916
3981
  val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
3982
+ } else {
3983
+ GGML_ASSERT(false);
3917
3984
  }
3918
3985
  fprintf(stderr, "% 7.2f ", val);
3919
3986
  } else {
@@ -4329,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
4329
4396
 
4330
4397
  std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
4331
4398
 
4332
- if (avg_err > 0.1 || std::isnan(avg_err)) {
4399
+ if (avg_err > 0.01 || std::isnan(avg_err)) {
4333
4400
  std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
4334
4401
  std::cerr << "Actual result: " << std::endl << std::endl;
4335
4402
  ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4379,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
4379
4446
  return extra;
4380
4447
  }
4381
4448
 
4382
- static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
4383
- return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
4384
- }
4385
-
4386
4449
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
4387
4450
  #ifdef GGML_VULKAN_DEBUG
4388
4451
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
4389
4452
  #endif
4390
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4391
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4392
- || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
4393
-
4394
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
4453
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
4395
4454
  return;
4396
4455
  }
4397
4456
 
4398
4457
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
4399
- if (extra == nullptr) {
4400
- // Workaround for CPU backend BLAS matmul calls
4401
- extra = ggml_vk_tensor_create_extra(node);
4402
- }
4403
4458
 
4404
4459
  ggml_tensor * src0 = node->src[0];
4405
4460
  ggml_tensor * src1 = node->src[1];
@@ -4419,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
4419
4474
  const int64_t ne22 = node->ne[2];
4420
4475
  const int64_t ne23 = node->ne[3];
4421
4476
 
4422
- const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
4477
+ const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
4478
+ const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
4479
+
4480
+ const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
4481
+ const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
4482
+
4483
+ const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
4484
+
4485
+ bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
4486
+
4487
+ const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
4488
+ const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
4423
4489
 
4424
4490
  int split_k;
4425
4491
  if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
@@ -4431,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
4431
4497
  const uint32_t y_ne = ne10 * ne11;
4432
4498
  const uint32_t d_ne = ne20 * ne21;
4433
4499
 
4434
- const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
4435
- const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
4436
- const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
4437
- const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
4500
+ const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
4501
+ const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
4438
4502
  uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
4439
4503
  const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
4440
4504
 
@@ -4477,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
4477
4541
  break;
4478
4542
  case GGML_OP_MUL_MAT:
4479
4543
  case GGML_OP_MUL_MAT_ID:
4480
- if (ctx->prealloc_size_qx < qx_sz) {
4481
- ctx->prealloc_size_qx = qx_sz;
4482
- }
4483
- if (ctx->prealloc_size_qy < qy_sz) {
4484
- ctx->prealloc_size_qy = qy_sz;
4485
- }
4486
4544
  if (ctx->prealloc_size_x < x_sz) {
4487
4545
  ctx->prealloc_size_x = x_sz;
4488
4546
  }
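With the qx/qy bookkeeping gone, only the converted x/y scratch sizes and the split-k size are tracked, and x_sz/y_sz stay zero whenever the corresponding source needs no conversion pass. A small worked example of the sizing arithmetic, assuming ggml_vk_align_size rounds its first argument up to the next multiple of the second and a minStorageBufferOffsetAlignment of 64 bytes (both are assumptions, not taken from this diff):

    #include <cstdint>
    #include <cstdio>

    // Assumed behaviour of ggml_vk_align_size: round width up to a multiple of align.
    static uint64_t align_up(uint64_t width, uint64_t align) {
        return ((width + align - 1) / align) * align;
    }

    int main() {
        const uint64_t y_ne       = 4096ull * 512ull;    // ne10 * ne11 for one src1 matrix
        const uint64_t bytes      = 4 * y_ne;            // 4-byte elements (src1_type resolved to F32)
        const uint64_t per_matrix = align_up(bytes, 64); // 8 MiB, already a multiple of 64
        const uint64_t y_sz       = per_matrix * 1 * 1;  // times the ne12 * ne13 batch dimensions
        printf("y_sz = %llu bytes\n", (unsigned long long) y_sz);
        return 0;
    }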
@@ -4506,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4506
4564
  return;
4507
4565
  }
4508
4566
  #ifdef GGML_VULKAN_DEBUG
4509
- std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
4567
+ std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
4510
4568
  #endif
4511
4569
  #if defined(GGML_VULKAN_RUN_TESTS)
4512
4570
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
@@ -4569,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4569
4627
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
4570
4628
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
4571
4629
 
4630
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
4631
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
4632
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
4633
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
4634
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
4635
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
4636
+
4637
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
4638
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
4639
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
4640
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
4641
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
4642
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
4643
+
4644
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
4645
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
4646
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
4647
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
4648
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
4649
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
4650
+
4651
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
4652
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
4653
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
4654
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
4655
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
4656
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
4657
+
4658
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
4659
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
4660
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
4661
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
4662
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
4663
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
4664
+
4572
4665
  std::cerr << std::endl;
4573
4666
 
4574
4667
  const std::vector<size_t> vals {
@@ -4608,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4608
4701
  GGML_ASSERT(false);
4609
4702
  #endif
4610
4703
 
4611
- if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
4612
- // Resize buffer
4613
- if (ctx->prealloc_qx != nullptr) {
4614
- ggml_vk_destroy_buffer(ctx->prealloc_qx);
4615
- }
4616
- ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
4617
- }
4618
- if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
4619
- // Resize buffer
4620
- if (ctx->prealloc_qy != nullptr) {
4621
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
4622
- }
4623
- ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
4624
- }
4625
4704
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
4626
4705
  // Resize buffer
4627
4706
  if (ctx->prealloc_x != nullptr) {
@@ -4655,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
4655
4734
  }
4656
4735
 
4657
4736
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
4658
- const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
4659
- || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4660
- || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4661
-
4662
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
4737
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
4663
4738
  return;
4664
4739
  }
4665
4740
 
@@ -4687,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4687
4762
  }
4688
4763
  break;
4689
4764
  case GGML_OP_REPEAT:
4690
- // case GGML_OP_GET_ROWS:
4765
+ case GGML_OP_GET_ROWS:
4691
4766
  case GGML_OP_ADD:
4692
4767
  case GGML_OP_MUL:
4693
4768
  case GGML_OP_SCALE:
@@ -4711,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4711
4786
  case GGML_OP_ARGSORT:
4712
4787
  break;
4713
4788
  default:
4714
- if (any_on_device) {
4715
- std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
4716
- GGML_ASSERT(false);
4717
- }
4789
+ std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
4790
+ GGML_ASSERT(false);
4718
4791
  return;
4719
4792
  }
4720
4793
 
@@ -4763,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4763
4836
  case GGML_OP_PERMUTE:
4764
4837
  case GGML_OP_TRANSPOSE:
4765
4838
  case GGML_OP_NONE:
4766
- ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
4767
-
4768
4839
  break;
4769
4840
  case GGML_OP_NORM:
4770
4841
  ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4831,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
4831
4902
  }
4832
4903
 
4833
4904
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
4834
- const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
4835
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
4836
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
4837
-
4838
- if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
4905
+ if (ctx->disable) {
4839
4906
  return false;
4840
4907
  }
4841
4908
 
@@ -4878,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
4878
4945
  break;
4879
4946
  case GGML_OP_MUL_MAT:
4880
4947
  case GGML_OP_MUL_MAT_ID:
4881
- if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
4882
- return false;
4883
- }
4884
-
4885
4948
  extra = (ggml_tensor_extra_gpu *) tensor->extra;
4886
4949
 
4887
4950
  break;
@@ -4995,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
4995
5058
  #endif
4996
5059
  ggml_vk_graph_cleanup(ctx);
4997
5060
 
4998
- ggml_vk_destroy_buffer(ctx->prealloc_qx);
4999
- ggml_vk_destroy_buffer(ctx->prealloc_qy);
5000
5061
  ggml_vk_destroy_buffer(ctx->prealloc_x);
5001
5062
  ggml_vk_destroy_buffer(ctx->prealloc_y);
5002
5063
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5007,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5007
5068
  ggml_vk_destroy_buffer(buffer);
5008
5069
  }
5009
5070
 
5010
- ctx->prealloc_size_qx = 0;
5011
- ctx->prealloc_size_qy = 0;
5012
5071
  ctx->prealloc_size_x = 0;
5013
5072
  ctx->prealloc_size_y = 0;
5014
5073
  ctx->prealloc_size_split_k = 0;
@@ -5039,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
5039
5098
  snprintf(description, description_size, "%s", props.deviceName.data());
5040
5099
  }
5041
5100
 
5042
- // CPU assist interface
5043
-
5044
- void ggml_vk_init_cpu_assist() {
5045
- ggml_vk_instance_init();
5046
-
5047
- std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
5048
-
5049
- for (int i = 0; i < ggml_vk_get_device_count(); i++) {
5050
- ggml_vk_print_gpu_info(i);
5051
- }
5052
- // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
5053
- ggml_backend_vk_init(0);
5054
- }
5055
-
5056
- void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
5057
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5058
-
5059
- if (!ctx->initialized) {
5060
- return;
5061
- }
5062
-
5063
- ggml_vk_preallocate_buffers_graph(ctx, node);
5064
- }
5065
-
5066
- void ggml_vk_preallocate_buffers_cpu_assist() {
5067
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5068
-
5069
- if (!ctx->initialized) {
5070
- return;
5071
- }
5072
-
5073
- ggml_vk_preallocate_buffers(ctx);
5074
- }
5075
-
5076
- void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
5077
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5078
-
5079
- if (!ctx->initialized) {
5080
- return;
5081
- }
5082
-
5083
- ggml_vk_build_graph(ctx, node, last_node);
5084
- }
5085
-
5086
- bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
5087
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5088
-
5089
- if (!ctx->initialized) {
5090
- return false;
5091
- }
5092
-
5093
- return ggml_vk_compute_forward(ctx, params, tensor);
5094
- }
5095
-
5096
- void ggml_vk_graph_cleanup_cpu_assist() {
5097
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5098
-
5099
- if (!ctx->initialized) {
5100
- return;
5101
- }
5102
-
5103
- ggml_vk_graph_cleanup(ctx);
5104
- }
5105
-
5106
- void ggml_vk_free_cpu_assist() {
5107
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
5108
-
5109
- if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
5110
- return;
5111
- }
5112
-
5113
- ggml_backend_vk_free(vk_instance.backends[0]);
5114
- }
5115
-
5116
5101
  // backend interface
5117
5102
 
5118
5103
  #define UNUSED GGML_UNUSED
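The hunk above removes the whole CPU-assist shim (ggml_vk_init_cpu_assist and the *_cpu_assist wrappers), which let the CPU backend hand selected work to Vulkan device 0. After this change the code is reached only through the regular ggml backend interface. A minimal sketch of that direct path, using only entry points that remain in this diff and assuming the usual public declarations in ggml-vulkan.h and ggml-backend.h:

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    int main() {
        // Device 0 is initialized on first use instead of via ggml_vk_init_cpu_assist.
        ggml_backend_t backend = ggml_backend_vk_init(0);
        // Per-device buffer type for allocating tensors on that device.
        ggml_backend_buffer_type_t buft = ggml_backend_vk_buffer_type(0);
        (void) buft;
        // The generic free replaces the removed ggml_vk_free_cpu_assist.
        ggml_backend_free(backend);
        return 0;
    }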
@@ -5324,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
5324
5309
  /* .is_host = */ NULL,
5325
5310
  };
5326
5311
 
5327
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) {
5312
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
5328
5313
  #ifdef GGML_VULKAN_DEBUG
5329
- std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl;
5314
+ std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
5330
5315
  #endif
5331
5316
 
5332
- GGML_ASSERT(idx < vk_instance.device_indices.size());
5317
+ GGML_ASSERT(dev_num < vk_instance.device_indices.size());
5333
5318
 
5334
- ggml_backend_vk_init(idx);
5319
+ ggml_backend_vk_init(dev_num);
5335
5320
 
5336
- return &vk_instance.buffer_types[idx];
5321
+ return &vk_instance.buffer_types[dev_num];
5337
5322
  }
5338
5323
 
5339
5324
  // host buffer type
@@ -5502,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
5502
5487
  vk_buffer src_buf = src_extra->buffer_gpu.lock();
5503
5488
  vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
5504
5489
 
5505
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src));
5490
+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
5506
5491
  return true;
5507
5492
  }
5508
5493
 
@@ -5536,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
5536
5521
  }
5537
5522
 
5538
5523
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
5524
+ #ifdef GGML_VULKAN_DEBUG
5525
+ std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
5526
+ #endif
5539
5527
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
5540
5528
 
5541
5529
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5560,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
5560
5548
  for (int i = 0; i < cgraph->n_nodes; i++) {
5561
5549
  ggml_tensor * node = cgraph->nodes[i];
5562
5550
 
5563
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
5551
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
5564
5552
  continue;
5565
5553
  }
5566
5554
 
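Besides the no-op view/permute cases, graph compute now also skips nodes for which ggml_is_empty(node) is true, so zero-sized tensors never reach the per-op dispatch. The sketch below shows the assumed meaning of that check (an assumption about ggml_is_empty, not a copy of it): a tensor is empty when any of its dimensions is zero.

    #include "ggml.h"

    // Assumed semantics of ggml_is_empty: true when any dimension is 0,
    // i.e. the tensor holds no elements and there is nothing to compute.
    static bool tensor_is_empty_sketch(const struct ggml_tensor * t) {
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            if (t->ne[i] == 0) {
                return true;
            }
        }
        return false;
    }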
@@ -5596,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5596
5584
  }
5597
5585
  break;
5598
5586
  case GGML_OP_MUL_MAT:
5599
- case GGML_OP_MUL_MAT_ID:
5587
+ // case GGML_OP_MUL_MAT_ID:
5600
5588
  {
5589
+ switch (op->src[0]->type) {
5590
+ case GGML_TYPE_F32:
5591
+ case GGML_TYPE_F16:
5592
+ case GGML_TYPE_Q4_0:
5593
+ case GGML_TYPE_Q4_1:
5594
+ case GGML_TYPE_Q5_0:
5595
+ case GGML_TYPE_Q5_1:
5596
+ case GGML_TYPE_Q8_0:
5597
+ case GGML_TYPE_Q2_K:
5598
+ case GGML_TYPE_Q3_K:
5599
+ case GGML_TYPE_Q4_K:
5600
+ case GGML_TYPE_Q5_K:
5601
+ case GGML_TYPE_Q6_K:
5602
+ break;
5603
+ default:
5604
+ return false;
5605
+ }
5601
5606
  struct ggml_tensor * a;
5602
5607
  struct ggml_tensor * b;
5603
5608
  if (op->op == GGML_OP_MUL_MAT) {
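supports_op now whitelists the quantized weight types for GGML_OP_MUL_MAT, including the k-quants added in this release, while GGML_OP_MUL_MAT_ID is commented out and therefore reported as unsupported. Callers normally probe this through the generic ggml_backend_supports_op wrapper; a hedged sketch of such a probe (the shapes and the expected result are illustrative, not taken from the diff):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    int main() {
        struct ggml_init_params params = { /* .mem_size   = */ 16 * 1024 * 1024,
                                           /* .mem_buffer = */ NULL,
                                           /* .no_alloc   = */ true };
        struct ggml_context * ctx = ggml_init(params);

        // Q6_K weight times F32 activation; only metadata is needed for the query.
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q6_K, 4096, 4096);
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  4096, 8);
        struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);

        ggml_backend_t backend = ggml_backend_vk_init(0);
        const bool ok = ggml_backend_supports_op(backend, y); // expected to be true after this change

        ggml_backend_free(backend);
        ggml_free(ctx);
        return ok ? 0 : 1;
    }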
@@ -5612,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5612
5617
  }
5613
5618
  return true;
5614
5619
  } break;
5615
- // case GGML_OP_GET_ROWS:
5616
- // {
5617
- // switch (op->src[0]->type) {
5618
- // case GGML_TYPE_F16:
5619
- // case GGML_TYPE_F32:
5620
- // case GGML_TYPE_Q4_0:
5621
- // case GGML_TYPE_Q4_1:
5622
- // case GGML_TYPE_Q5_0:
5623
- // case GGML_TYPE_Q5_1:
5624
- // case GGML_TYPE_Q8_0:
5625
- // return true;
5626
- // default:
5627
- // return false;
5628
- // }
5629
- // } break;
5620
+ case GGML_OP_GET_ROWS:
5621
+ {
5622
+ switch (op->src[0]->type) {
5623
+ case GGML_TYPE_F32:
5624
+ case GGML_TYPE_F16:
5625
+ case GGML_TYPE_Q4_0:
5626
+ case GGML_TYPE_Q4_1:
5627
+ case GGML_TYPE_Q5_0:
5628
+ case GGML_TYPE_Q5_1:
5629
+ case GGML_TYPE_Q8_0:
5630
+ return true;
5631
+ default:
5632
+ return false;
5633
+ }
5634
+ } break;
5630
5635
  case GGML_OP_CPY:
5636
+ case GGML_OP_DUP:
5631
5637
  {
5632
5638
  ggml_type src0_type = op->src[0]->type;
5633
- ggml_type src1_type = op->src[1]->type;
5639
+ ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
5634
5640
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
5635
5641
  return true;
5636
5642
  }
@@ -5642,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5642
5648
  }
5643
5649
  return false;
5644
5650
  } break;
5645
- case GGML_OP_DUP:
5646
5651
  // case GGML_OP_REPEAT:
5647
5652
  // {
5648
5653
  // ggml_type src0_type = op->src[0]->type;
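The hunks above also re-enable GGML_OP_GET_ROWS for F32, F16 and the legacy 4- to 8-bit quant types, and fold GGML_OP_DUP into the GGML_OP_CPY branch: a DUP node has no second source, so the destination type falls back to the source type, describing a same-type copy. A small sketch of that fallback (the real function goes on to check more type combinations than the single F32 case shown here):

    #include "ggml.h"

    // Sketch of the null-src1 fallback: GGML_OP_DUP carries no op->src[1],
    // so a same-type copy is assumed.
    static bool cpy_or_dup_is_f32_copy(const struct ggml_tensor * op) {
        const enum ggml_type src0_type = op->src[0]->type;
        const enum ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
        return src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32;
    }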
@@ -5679,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
5679
5684
  UNUSED(backend);
5680
5685
  }
5681
5686
 
5687
+ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
5688
+ const ggml_tensor * dst = op;
5689
+
5690
+ const int min_batch_size = 32;
5691
+
5692
+ if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
5693
+ return true;
5694
+ }
5695
+
5696
+ return false;
5697
+
5698
+ UNUSED(backend);
5699
+ }
5700
+
5682
5701
  // TODO: enable async and synchronize
5683
5702
  static ggml_backend_i ggml_backend_vk_interface = {
5684
5703
  /* .get_name = */ ggml_backend_vk_name,
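The new ggml_backend_vk_offload_op hook reports an operation as worth offloading once its second dimension (typically the batch/token count for matmul-style ops) exceeds 32, with GGML_OP_GET_ROWS always excluded. The snippet below simply restates that predicate for a few batch sizes:

    #include <cstdio>

    // Restates the predicate from ggml_backend_vk_offload_op above:
    // offload when ne[1] > 32 and the op is not GGML_OP_GET_ROWS.
    static bool worth_offloading(long ne1, bool is_get_rows) {
        const long min_batch_size = 32;
        return ne1 > min_batch_size && !is_get_rows;
    }

    int main() {
        printf("batch 1   -> %d\n", worth_offloading(1,   false)); // 0: small batches stay where they are
        printf("batch 33  -> %d\n", worth_offloading(33,  false)); // 1: large batches are offloaded
        printf("batch 512 -> %d\n", worth_offloading(512, true));  // 0: GET_ROWS is never offloaded
        return 0;
    }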
@@ -5693,6 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
5693
5712
  /* .graph_plan_compute = */ NULL,
5694
5713
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
5695
5714
  /* .supports_op = */ ggml_backend_vk_supports_op,
5715
+ /* .offload_op = */ ggml_backend_vk_offload_op,
5696
5716
  /* .event_new = */ NULL,
5697
5717
  /* .event_free = */ NULL,
5698
5718
  /* .event_record = */ NULL,
@@ -5705,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
5705
5725
  return &guid;
5706
5726
  }
5707
5727
 
5708
- GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5709
- if (vk_instance.initialized[idx]) {
5710
- return vk_instance.backends[idx];
5728
+ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
5729
+ if (vk_instance.initialized[dev_num]) {
5730
+ return vk_instance.backends[dev_num];
5711
5731
  }
5712
5732
  #ifdef GGML_VULKAN_DEBUG
5713
- std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl;
5733
+ std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
5714
5734
  #endif
5715
5735
 
5716
- ggml_backend_vk_context * ctx = &vk_instance.contexts[idx];
5717
- ggml_vk_init(ctx, idx);
5718
- ctx->name = GGML_VK_NAME + std::to_string(idx);
5719
- vk_instance.buffer_types[idx] = {
5736
+ ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
5737
+ ggml_vk_init(ctx, dev_num);
5738
+ ctx->name = GGML_VK_NAME + std::to_string(dev_num);
5739
+ vk_instance.buffer_types[dev_num] = {
5720
5740
  /* .iface = */ ggml_backend_vk_buffer_type_interface,
5721
5741
  /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
5722
5742
  };
5723
- vk_instance.initialized[idx] = true;
5743
+ vk_instance.initialized[dev_num] = true;
5724
5744
 
5725
5745
  ggml_backend_t vk_backend = new ggml_backend {
5726
5746
  /* .guid = */ ggml_backend_vk_guid(),
@@ -5728,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
5728
5748
  /* .context = */ &vk_instance.contexts[ctx->idx],
5729
5749
  };
5730
5750
 
5731
- vk_instance.backends[idx] = vk_backend;
5751
+ vk_instance.backends[dev_num] = vk_backend;
5732
5752
 
5733
5753
  return vk_backend;
5734
5754
  }
@@ -5772,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
5772
5792
  extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
5773
5793
 
5774
5794
  GGML_CALL int ggml_backend_vk_reg_devices() {
5775
- for (auto idx : vk_instance.device_indices) {
5795
+ ggml_vk_instance_init();
5796
+
5797
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
5776
5798
  char name[128];
5777
- snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
5778
- ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx);
5799
+ snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
5800
+ ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
5779
5801
  }
5780
5802
  return vk_instance.device_indices.size();
5781
5803
  }
@@ -5859,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
5859
5881
  val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
5860
5882
  } else if (tensor->type == GGML_TYPE_F16) {
5861
5883
  val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
5884
+ } else {
5885
+ GGML_ASSERT(false);
5862
5886
  }
5863
5887
  fprintf(stderr, "% 7.2f ", val);
5864
5888
  } else {
@@ -5953,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
5953
5977
  return;
5954
5978
  }
5955
5979
 
5980
+ #ifdef GGML_VULKAN_DEBUG
5981
+ std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
5982
+ #endif
5983
+
5956
5984
  ggml_tensor * src0 = tensor->src[0];
5957
5985
  ggml_tensor * src1 = tensor->src[1];
5958
5986
  ggml_tensor * src2 = tensor->src[2];
@@ -6212,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6212
6240
  tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
6213
6241
  } else if (tensor->op == GGML_OP_TRANSPOSE) {
6214
6242
  tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
6243
+ } else if (tensor->op == GGML_OP_GET_ROWS) {
6244
+ tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
6215
6245
  } else {
6216
6246
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6217
6247
  GGML_ASSERT(false);
@@ -6262,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6262
6292
  return;
6263
6293
  }
6264
6294
 
6295
+ #ifdef GGML_VULKAN_DEBUG
6296
+ std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
6297
+ #endif
6298
+
6265
6299
  ggml_tensor * src0 = tensor->src[0];
6266
6300
  ggml_tensor * src1 = tensor->src[1];
6267
6301
 
@@ -6405,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6405
6439
  free(tensor_data);
6406
6440
  }
6407
6441
  }
6408
-
6409
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
6410
- ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
6411
-
6412
- ggml_vk_check_results_0(ctx, params, tensor);
6413
- }
6414
6442
  #endif