llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp (excerpt; several removed lines are truncated in the registry's diff view)

```diff
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
```
```diff
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t
-    vk_buffer
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
```
```diff
@@ -809,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
```
```diff
@@ -998,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
```
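Note: the hunk above extends `pipeline_dequant_mul_mat_mat`, a table of matrix-matrix multiplication pipelines indexed by quantization type, to cover the K-quants. The sketch below only illustrates that table-per-type pattern; `QuantType`, `MatmulPipeline`, and `PipelineTable` are hypothetical stand-ins, not llama.cpp's actual types.

```cpp
// Illustrative stand-in for a per-quantization-type pipeline table.
// All names here are hypothetical; they mirror the shape of the real code, not its API.
#include <array>
#include <iostream>
#include <memory>
#include <string>

enum QuantType { Q4_0, Q5_0, Q8_0, Q2_K, Q6_K, QUANT_TYPE_COUNT };

struct MatmulPipeline {
    std::string base_name; // e.g. "matmul_q2_k_f32"; l/m/s variants hang off this
};

using PipelineTable = std::array<std::shared_ptr<MatmulPipeline>, QUANT_TYPE_COUNT>;

int main() {
    PipelineTable dequant_mul_mat_mat{}; // every entry starts as nullptr

    // Registration mirrors the hunk above: the 4/5/8-bit types were already
    // present, and the K-quants are now added alongside them.
    dequant_mul_mat_mat[Q8_0] = std::make_shared<MatmulPipeline>(MatmulPipeline{"matmul_q8_0_f32"});
    dequant_mul_mat_mat[Q2_K] = std::make_shared<MatmulPipeline>(MatmulPipeline{"matmul_q2_k_f32"});
    dequant_mul_mat_mat[Q6_K] = std::make_shared<MatmulPipeline>(MatmulPipeline{"matmul_q6_k_f32"});

    // Dispatch later indexes the table by the source tensor's type; a null
    // entry means "no mat-mat pipeline registered for this type".
    std::cout << (dequant_mul_mat_mat[Q2_K] ? "Q2_K registered" : "Q2_K missing") << "\n";
    return 0;
}
```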
```diff
@@ -1055,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
```
```diff
@@ -1111,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     }
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32",  mul_mat_vec_f16_f32_len,  mul_mat_vec_f16_f32_data,  "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
```
```diff
@@ -1139,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
-
-
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
 
```
```diff
@@ -1341,7 +1417,33 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(tmp);
         }
     } else {
-        vk_instance.
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            GGML_ASSERT(false);
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                vk_instance.device_indices.push_back(i);
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+
+    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
     }
 
     vk_instance_initialized = true;
```
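Note: the new `else` branch above enumerates the physical devices, defaults to every discrete GPU, and falls back to device 0 if none is found. Below is a self-contained sketch of just that selection rule; `DeviceType` and the sample device list stand in for `vk::PhysicalDeviceType` and `enumeratePhysicalDevices()` so the example compiles without the Vulkan SDK.

```cpp
// Sketch of the "prefer all discrete GPUs, else fall back to device 0" rule.
#include <cstdio>
#include <vector>

enum class DeviceType { IntegratedGpu, DiscreteGpu, Cpu };

std::vector<size_t> pick_default_devices(const std::vector<DeviceType> & devices) {
    std::vector<size_t> indices;
    for (size_t i = 0; i < devices.size(); i++) {
        if (devices[i] == DeviceType::DiscreteGpu) {
            indices.push_back(i); // default: every dedicated GPU
        }
    }
    if (indices.empty() && !devices.empty()) {
        indices.push_back(0);     // no dedicated GPU found: use device 0
    }
    return indices;
}

int main() {
    // e.g. an integrated GPU at index 0 and two discrete GPUs at 1 and 2
    std::vector<DeviceType> devs = { DeviceType::IntegratedGpu,
                                     DeviceType::DiscreteGpu,
                                     DeviceType::DiscreteGpu };
    for (size_t idx : pick_default_devices(devs)) {
        std::printf("using device %zu\n", idx);
    }
    return 0;
}
```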
```diff
@@ -1567,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 
     switch (src0_type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
            break;
        default:
            return nullptr;
```
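Note: the widened `switch (src0_type)` above acts as a whitelist in `ggml_vk_get_mul_mat_mat_pipeline`: only types with a registered mat-mat pipeline fall through, everything else returns `nullptr` so the caller takes another path. A minimal sketch of that guard pattern, using a hypothetical `QuantType` enum rather than ggml's `ggml_type`:

```cpp
// Guard pattern: whitelist the quantization types that have a dedicated
// matrix-matrix pipeline, reject everything else.
#include <cstdio>

enum class QuantType {
    F32, F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0,
    Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, IQ2_XXS
};

bool has_mul_mat_mat_pipeline(QuantType t) {
    switch (t) {
        case QuantType::Q4_0:
        case QuantType::Q4_1:
        case QuantType::Q5_0:
        case QuantType::Q5_1:
        case QuantType::Q8_0:
        case QuantType::Q2_K:
        case QuantType::Q3_K:
        case QuantType::Q4_K:
        case QuantType::Q5_K:
        case QuantType::Q6_K:
            return true;  // in the real code: fall through to the table lookup
        default:
            return false; // in the real code: return nullptr, use another path
    }
}

int main() {
    std::printf("Q6_K supported: %d\n",    has_mul_mat_mat_pipeline(QuantType::Q6_K)    ? 1 : 0);
    std::printf("IQ2_XXS supported: %d\n", has_mul_mat_mat_pipeline(QuantType::IQ2_XXS) ? 1 : 0);
    return 0;
}
```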
```diff
@@ -2034,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         ggml_vk_submit(subctx, ctx->fence);
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
```
```diff
@@ -2131,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
-        ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
     }
 }
 
```
```diff
@@ -2298,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
     case VK_VENDOR_ID_INTEL:
         return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
+    default:
+        break;
     }
 
     if (m <= 32 || n <= 32) {
```
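Note: the `default: break;` added above only makes the vendor switch fall through explicitly when the vendor ID is not one of the handled cases; selection then continues with the generic size-based choice visible in the context (`if (m <= 32 || n <= 32)`). The sketch below illustrates that kind of small/medium/large tile choice; the 64 cutoff and the tile names are assumptions for illustration, not values taken from the diff.

```cpp
// Illustrative small/medium/large tile choice driven by the matmul dimensions.
#include <cstdint>
#include <cstdio>

enum class Tile { Small, Medium, Large };

Tile guess_matmul_tile(uint32_t m, uint32_t n) {
    if (m <= 32 || n <= 32) {  // threshold visible in the diff context
        return Tile::Small;
    }
    if (m <= 64 || n <= 64) {  // assumed intermediate threshold
        return Tile::Medium;
    }
    return Tile::Large;
}

int main() {
    std::printf("%d %d %d\n",
                static_cast<int>(guess_matmul_tile(16, 4096)),
                static_cast<int>(guess_matmul_tile(48, 48)),
                static_cast<int>(guess_matmul_tile(512, 512)));
    return 0;
}
```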
```diff
@@ -2423,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
 
     const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
 
```
```diff
@@ -2469,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
-    if (
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
+    if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
         qx_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_Qx != nullptr);
     }
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qy != nullptr);
```
```diff
@@ -2530,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
-    } else if (
-
-
-
-            ctx->staging_offset = qx_sz * ne02 * ne03;
-        }
-
-        if (qx_needs_dequant) {
-            const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-        }
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
     }
 
     uint32_t stride_batch_x = ne00*ne01;
     uint32_t stride_batch_y = ne10*ne11;
 
-    if (!ggml_vk_dim01_contiguous(src0) && !
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
         stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
     }
 
-    if (!ggml_vk_dim01_contiguous(src1) && !
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
```
```diff
@@ -2616,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }
 
-    const bool
-    const bool
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
 
     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
 
```
         @@ -2644,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context 
     | 
|
| 
       2644 
2735 
     | 
    
         
             
                uint64_t x_buf_offset = 0;
         
     | 
| 
       2645 
2736 
     | 
    
         
             
                vk_buffer d_Y;
         
     | 
| 
       2646 
2737 
     | 
    
         
             
                uint64_t y_buf_offset = 0;
         
     | 
| 
       2647 
     | 
    
         
            -
                if 
     | 
| 
       2648 
     | 
    
         
            -
                    d_Qx = ctx->prealloc_qx;
         
     | 
| 
       2649 
     | 
    
         
            -
                } else if(!src1_uma) {
         
     | 
| 
      
 2738 
     | 
    
         
            +
                if(!src0_uma) {
         
     | 
| 
       2650 
2739 
     | 
    
         
             
                    d_Qx = extra_src0->buffer_gpu.lock();
         
     | 
| 
       2651 
2740 
     | 
    
         
             
                    qx_buf_offset = extra_src0->offset;
         
     | 
| 
       2652 
2741 
     | 
    
         
             
                    GGML_ASSERT(d_Qx != nullptr);
         
     | 
| 
       2653 
2742 
     | 
    
         
             
                }
         
     | 
| 
       2654 
     | 
    
         
            -
                if 
     | 
| 
       2655 
     | 
    
         
            -
                    d_Qy = ctx->prealloc_qy;
         
     | 
| 
       2656 
     | 
    
         
            -
                } else if(!src1_uma) {
         
     | 
| 
      
 2743 
     | 
    
         
            +
                if(!src1_uma) {
         
     | 
| 
       2657 
2744 
     | 
    
         
             
                    d_Qy = extra_src1->buffer_gpu.lock();
         
     | 
| 
       2658 
2745 
     | 
    
         
             
                    qy_buf_offset = extra_src1->offset;
         
     | 
| 
       2659 
2746 
     | 
    
         
             
                    GGML_ASSERT(d_Qy != nullptr);
         
     | 
| 
         @@ -2700,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context 
     | 
|
| 
       2700 
2787 
     | 
    
         
             
                if (x_non_contig) {
         
     | 
| 
       2701 
2788 
     | 
    
         
             
                    GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         
     | 
| 
       2702 
2789 
     | 
    
         
             
                    ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
         
     | 
| 
       2703 
     | 
    
         
            -
                } else if (load_x) {
         
     | 
| 
       2704 
     | 
    
         
            -
                    // copy data to device
         
     | 
| 
       2705 
     | 
    
         
            -
                    ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
         
     | 
| 
       2706 
2790 
     | 
    
         
             
                }
         
     | 
| 
       2707 
2791 
     | 
    
         
             
                if (y_non_contig) {
         
     | 
| 
       2708 
2792 
     | 
    
         
             
                    GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         
     | 
| 
       2709 
2793 
     | 
    
         
             
                    ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
         
     | 
| 
       2710 
     | 
    
         
            -
                } else if (load_y) {
         
     | 
| 
       2711 
     | 
    
         
            -
                    ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
         
     | 
| 
       2712 
2794 
     | 
    
         
             
                }
         
     | 
| 
       2713 
2795 
     | 
    
         | 
| 
       2714 
2796 
     | 
    
         
             
                for (uint64_t i13 = 0; i13 < ne13; i13++) {
         
     | 
| 
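The removed `load_x`/`load_y` branches above eliminate the backend's ad-hoc host-to-device staging (`ggml_vk_h2d_tensor_2d` into the old `prealloc_qx`/`prealloc_qy` buffers); operands are now expected to already live in a backend buffer when the graph runs. As a minimal, hedged sketch (not taken from this diff, and assuming the standard ggml-backend API of this release), a caller would place data on the device explicitly before compute, for example:

    // Sketch: allocate a tensor in a Vulkan backend buffer and upload data to it,
    // instead of relying on the backend to copy CPU-resident tensors on the fly.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-vulkan.h"
    #include <vector>

    static void upload_example() {
        ggml_init_params params = { /*.mem_size   =*/ 16 * 1024 * 1024,
                                    /*.mem_buffer =*/ nullptr,
                                    /*.no_alloc   =*/ true };          // metadata only
        ggml_context * ctx = ggml_init(params);
        ggml_tensor  * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

        ggml_backend_t        backend = ggml_backend_vk_init(0);                     // Vulkan device 0
        ggml_backend_buffer_t buf     = ggml_backend_alloc_ctx_tensors(ctx, backend); // places `a` in a device buffer

        std::vector<float> host(64 * 64, 1.0f);
        ggml_backend_tensor_set(a, host.data(), 0, ggml_nbytes(a));   // explicit host-to-device copy

        ggml_backend_buffer_free(buf);
        ggml_backend_free(backend);
        ggml_free(ctx);
    }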
@@ -2789,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;
@@ -2805,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2822,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -2881,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
     const uint64_t d_ne = ne01 * ne11 * ne12;

     const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
@@ -2898,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
     const uint64_t qx_buf_offset = extra_src0->offset;
     GGML_ASSERT(d_Qx != nullptr);
-    if (
-        d_Qy = ctx->prealloc_qy;
-    } else {
+    if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
         qy_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Qx != nullptr);
@@ -2915,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
@@ -3174,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)));  // NOLINT
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))));  // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0));  // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
     const uint64_t ne00 = src0->ne[0];
@@ -3242,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }

-
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-    const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;
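The new `x_sz` expression divides by `ggml_blck_size()` so that block-quantized `src0` types (now reachable here via GGML_OP_GET_ROWS) are sized per block rather than as if every element were a full `ggml_type_size()` wide. As a hedged illustration (not part of the diff), the two helpers relate like this; for F32/F16 the block size is 1, while quantized types report bytes per block and elements per block:

    // Sketch: print the block geometry that the sizing expression above relies on.
    #include <cstdio>
    #include "ggml.h"

    int main() {
        const enum ggml_type types[] = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0 };
        for (enum ggml_type t : types) {
            std::printf("%-8s block bytes=%zu elements/block=%d\n",
                        ggml_type_name(t), ggml_type_size(t), ggml_blck_size(t));
        }
        return 0;
    }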
@@ -3261,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     GGML_ASSERT(d_D != nullptr);
     uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY);  // NOLINT
-    if
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
+    if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
         x_buf_offset = extra_src0->offset;
         GGML_ASSERT(d_X != nullptr);
     }
-    if (
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
+    if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
         y_buf_offset = extra_src1->offset;
         GGML_ASSERT(d_Y != nullptr);
     }

-    GGML_ASSERT(!transfer_src2);
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
         z_buf_offset = extra_src2->offset;
         GGML_ASSERT(d_Z != nullptr);
     }

-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
         x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
         d_sz = ggml_nbytes(dst);

-        if (
+        if (x_buf_offset + x_sz >= d_X->size) {
             x_sz = VK_WHOLE_SIZE;
         }
-        if (
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
             d_sz = VK_WHOLE_SIZE;
         }
     }

     std::array<uint32_t, 3> elements;

-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
     // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
+    if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
         ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

         switch (dst->op) {
@@ -3322,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = {  (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
             break;
         }

-        if (op != GGML_OP_CPY) {
+        if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
            if (x_sz != VK_WHOLE_SIZE) {
                x_sz *= ne02 * ne03;
            }
-            if (y_sz != VK_WHOLE_SIZE) {
+            if (use_src1 && y_sz != VK_WHOLE_SIZE) {
                y_sz *= ne12 * ne13;
            }
            if (d_sz != VK_WHOLE_SIZE) {
@@ -3386,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         case GGML_OP_ROPE:
             elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
             break;
+        case GGML_OP_GET_ROWS:
+            elements = {  (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+            break;
         default:
             elements = { (uint32_t)ne0, 1, 1 };
             break;
@@ -3420,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
 }

 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
 }

 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
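With `ggml_vk_get_rows` now routed through `ggml_vk_op_f32` and the quantized-type assert relaxed for GGML_OP_GET_ROWS, a row-gather node built through the normal ggml graph API can be executed by this backend. A minimal, hedged sketch of how such a node is constructed (backend allocation and graph scheduling are deliberately omitted, and the tensor shapes here are only illustrative):

    // Sketch: build a GGML_OP_GET_ROWS node with the public ggml API.
    #include "ggml.h"

    static ggml_tensor * build_get_rows(ggml_context * ctx) {
        // src0: e.g. a block-quantized embedding table, src1: int32 row indices.
        ggml_tensor * table = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 32000);
        ggml_tensor * ids   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 8);
        return ggml_get_rows(ctx, table, ids);   // dst->op == GGML_OP_GET_ROWS
    }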
@@ -3576,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
     if (is_neox) {
         const float theta_scale = powf(freq_base, -2.0f/n_dims);
         const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
+        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
     } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
+        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
     }
 }

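The extra braces around the correction dims in the two ROPE calls above suggest the push-constant structs now carry them as a nested `float[4]` member instead of four scalar fields, so the aggregate needs its own initializer braces. The real definitions live earlier in ggml-vulkan.cpp and are not shown in this hunk; a hedged sketch of the implied shape, for orientation only:

    // Sketch only: a plausible layout consistent with the brace-initialized
    // {corr_dims[0], corr_dims[1], 0.0f, 0.0f} argument; the actual struct may differ.
    #include <cstdint>

    struct vk_op_rope_push_constants_sketch {
        uint32_t ncols;
        float    freq_scale;
        uint32_t p_delta_rows;
        float    freq_base;
        float    ext_factor;
        float    attn_factor;
        float    corr_dims[4];   // aggregate member -> nested braces in the initializer
    };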
         @@ -3587,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, 
     | 
|
| 
       3587 
3654 
     | 
    
         
             
                ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
         
     | 
| 
       3588 
3655 
     | 
    
         
             
            }
         
     | 
| 
       3589 
3656 
     | 
    
         | 
| 
       3590 
     | 
    
         
            -
            static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
         
     | 
| 
       3591 
     | 
    
         
            -
                // If backend is CPU, data from src0 has to be copied off the device
         
     | 
| 
       3592 
     | 
    
         
            -
                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         
     | 
| 
       3593 
     | 
    
         
            -
                    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
         
     | 
| 
       3594 
     | 
    
         
            -
                    vk_buffer d_D = extra_src0->buffer_gpu.lock();
         
     | 
| 
       3595 
     | 
    
         
            -
                    ggml_vk_sync_buffers(subctx);
         
     | 
| 
       3596 
     | 
    
         
            -
                    ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
         
     | 
| 
       3597 
     | 
    
         
            -
                }
         
     | 
| 
       3598 
     | 
    
         
            -
            }
         
     | 
| 
       3599 
     | 
    
         
            -
             
     | 
| 
       3600 
3657 
     | 
    
         
             
            #ifdef GGML_VULKAN_RUN_TESTS
         
     | 
| 
       3601 
3658 
     | 
    
         
             
            static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
         
     | 
| 
       3602 
3659 
     | 
    
         
             
                if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
         
     | 
| 
         @@ -3619,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 
     | 
|
| 
       3619 
3676 
     | 
    
         
             
                                val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
         
     | 
| 
       3620 
3677 
     | 
    
         
             
                            } else if (type == GGML_TYPE_F16) {
         
     | 
| 
       3621 
3678 
     | 
    
         
             
                                val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
         
     | 
| 
      
 3679 
     | 
    
         
            +
                            } else {
         
     | 
| 
      
 3680 
     | 
    
         
            +
                                GGML_ASSERT(false);
         
     | 
| 
       3622 
3681 
     | 
    
         
             
                            }
         
     | 
| 
       3623 
3682 
     | 
    
         
             
                            fprintf(stderr, "% 7.2f ", val);
         
     | 
| 
       3624 
3683 
     | 
    
         
             
                        } else {
         
     | 
| 
         @@ -3920,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 
     | 
|
| 
       3920 
3979 
     | 
    
         
             
                                val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
         
     | 
| 
       3921 
3980 
     | 
    
         
             
                            } else if (tensor->type == GGML_TYPE_F16) {
         
     | 
| 
       3922 
3981 
     | 
    
         
             
                                val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
         
     | 
| 
      
 3982 
     | 
    
         
            +
                            } else {
         
     | 
| 
      
 3983 
     | 
    
         
            +
                                GGML_ASSERT(false);
         
     | 
| 
       3923 
3984 
     | 
    
         
             
                            }
         
     | 
| 
       3924 
3985 
     | 
    
         
             
                            fprintf(stderr, "% 7.2f ", val);
         
     | 
| 
       3925 
3986 
     | 
    
         
             
                        } else {
         
     | 
| 
         @@ -4335,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, 
     | 
|
| 
       4335 
4396 
     | 
    
         | 
| 
       4336 
4397 
     | 
    
         
             
                std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
         
     | 
| 
       4337 
4398 
     | 
    
         | 
| 
       4338 
     | 
    
         
            -
                if (avg_err > 0. 
     | 
| 
      
 4399 
     | 
    
         
            +
                if (avg_err > 0.01 || std::isnan(avg_err)) {
         
     | 
| 
       4339 
4400 
     | 
    
         
             
                    std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
         
     | 
| 
       4340 
4401 
     | 
    
         
             
                    std::cerr << "Actual result: " << std::endl << std::endl;
         
     | 
| 
       4341 
4402 
     | 
    
         
             
                    ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
         
     | 
| 
         @@ -4385,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) 
     | 
|
| 
       4385 
4446 
     | 
    
         
             
                return extra;
         
     | 
| 
       4386 
4447 
     | 
    
         
             
            }
         
     | 
| 
       4387 
4448 
     | 
    
         | 
| 
       4388 
     | 
    
         
            -
            static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
         
     | 
| 
       4389 
     | 
    
         
            -
                return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
         
     | 
| 
       4390 
     | 
    
         
            -
            }
         
     | 
| 
       4391 
     | 
    
         
            -
             
     | 
| 
       4392 
4449 
     | 
    
         
             
            static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
         
     | 
| 
       4393 
4450 
     | 
    
         
             
            #ifdef GGML_VULKAN_DEBUG
         
     | 
| 
       4394 
4451 
     | 
    
         
             
                std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
         
     | 
| 
       4395 
4452 
     | 
    
         
             
            #endif
         
     | 
| 
       4396 
     | 
    
         
            -
                 
     | 
| 
       4397 
     | 
    
         
            -
                    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
         
     | 
| 
       4398 
     | 
    
         
            -
                    || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
         
     | 
| 
       4399 
     | 
    
         
            -
             
     | 
| 
       4400 
     | 
    
         
            -
                if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
         
     | 
| 
      
 4453 
     | 
    
         
            +
                if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
         
     | 
| 
       4401 
4454 
     | 
    
         
             
                    return;
         
     | 
| 
       4402 
4455 
     | 
    
         
             
                }
         
     | 
| 
       4403 
4456 
     | 
    
         | 
| 
       4404 
4457 
     | 
    
         
             
                ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
         
     | 
| 
       4405 
     | 
    
         
            -
                if (extra == nullptr) {
         
     | 
| 
       4406 
     | 
    
         
            -
                    // Workaround for CPU backend BLAS matmul calls
         
     | 
| 
       4407 
     | 
    
         
            -
                    extra = ggml_vk_tensor_create_extra(node);
         
     | 
| 
       4408 
     | 
    
         
            -
                }
         
     | 
| 
       4409 
4458 
     | 
    
         | 
| 
       4410 
4459 
     | 
    
         
             
                ggml_tensor * src0 = node->src[0];
         
     | 
| 
       4411 
4460 
     | 
    
         
             
                ggml_tensor * src1 = node->src[1];
         
     | 
| 
         @@ -4425,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm 
     | 
|
| 
       4425 
4474 
     | 
    
         
             
                const int64_t ne22 = node->ne[2];
         
     | 
| 
       4426 
4475 
     | 
    
         
             
                const int64_t ne23 = node->ne[3];
         
     | 
| 
       4427 
4476 
     | 
    
         | 
| 
       4428 
     | 
    
         
            -
                const  
     | 
| 
      
 4477 
     | 
    
         
            +
                const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
         
     | 
| 
      
 4478 
     | 
    
         
            +
                const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
         
     | 
| 
      
 4479 
     | 
    
         
            +
             
     | 
| 
      
 4480 
     | 
    
         
            +
                const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
         
     | 
| 
      
 4481 
     | 
    
         
            +
                const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
         
     | 
| 
      
 4482 
     | 
    
         
            +
             
     | 
| 
      
 4483 
     | 
    
         
            +
                const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
         
     | 
| 
      
 4484 
     | 
    
         
            +
             
     | 
| 
      
 4485 
     | 
    
         
            +
                bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
         
     | 
| 
      
 4486 
     | 
    
         
            +
             
     | 
| 
      
 4487 
     | 
    
         
            +
                const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
         
     | 
| 
      
 4488 
     | 
    
         
            +
                const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
         
     | 
| 
       4429 
4489 
     | 
    
         | 
| 
       4430 
4490 
     | 
    
         
             
                int split_k;
         
     | 
| 
       4431 
4491 
     | 
    
         
             
                if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
         
     | 
| 
         @@ -4437,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm 
     | 
|
| 
       4437 
4497 
     | 
    
         
             
                const uint32_t y_ne = ne10 * ne11;
         
     | 
| 
       4438 
4498 
     | 
    
         
             
                const uint32_t d_ne = ne20 * ne21;
         
     | 
| 
       4439 
4499 
     | 
    
         | 
| 
       4440 
     | 
    
         
            -
                const uint64_t  
     | 
| 
       4441 
     | 
    
         
            -
                const uint64_t  
     | 
| 
       4442 
     | 
    
         
            -
                const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
         
     | 
| 
       4443 
     | 
    
         
            -
                const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
         
     | 
| 
      
 4500 
     | 
    
         
            +
                const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
         
     | 
| 
      
 4501 
     | 
    
         
            +
                const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
         
     | 
| 
       4444 
4502 
     | 
    
         
             
                uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
         
     | 
| 
       4445 
4503 
     | 
    
         
             
                const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
         
     | 
| 
       4446 
4504 
     | 
    
         | 
| 
         @@ -4483,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm 
     | 
|
| 
       4483 
4541 
     | 
    
         
             
                    break;
         
     | 
| 
       4484 
4542 
     | 
    
         
             
                case GGML_OP_MUL_MAT:
         
     | 
| 
       4485 
4543 
     | 
    
         
             
                case GGML_OP_MUL_MAT_ID:
         
     | 
| 
       4486 
     | 
    
         
            -
                    if (ctx->prealloc_size_qx < qx_sz) {
         
     | 
| 
       4487 
     | 
    
         
            -
                        ctx->prealloc_size_qx = qx_sz;
         
     | 
| 
       4488 
     | 
    
         
            -
                    }
         
     | 
| 
       4489 
     | 
    
         
            -
                    if (ctx->prealloc_size_qy < qy_sz) {
         
     | 
| 
       4490 
     | 
    
         
            -
                        ctx->prealloc_size_qy = qy_sz;
         
     | 
| 
       4491 
     | 
    
         
            -
                    }
         
     | 
| 
       4492 
4544 
     | 
    
         
             
                    if (ctx->prealloc_size_x < x_sz) {
         
     | 
| 
       4493 
4545 
     | 
    
         
             
                        ctx->prealloc_size_x = x_sz;
         
     | 
| 
       4494 
4546 
     | 
    
         
             
                    }
         
     | 
| 
         @@ -4512,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 
     | 
|
| 
       4512 
4564 
     | 
    
         
             
                    return;
         
     | 
| 
       4513 
4565 
     | 
    
         
             
                }
         
     | 
| 
       4514 
4566 
     | 
    
         
             
            #ifdef GGML_VULKAN_DEBUG
         
     | 
| 
       4515 
     | 
    
         
            -
                std::cerr << "ggml_vk_preallocate_buffers( 
     | 
| 
      
 4567 
     | 
    
         
            +
                std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
         
     | 
| 
       4516 
4568 
     | 
    
         
             
            #endif
         
     | 
| 
       4517 
4569 
     | 
    
         
             
            #if defined(GGML_VULKAN_RUN_TESTS)
         
     | 
| 
       4518 
4570 
     | 
    
         
             
                ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
         
     | 
| 
         @@ -4575,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 
     | 
|
| 
       4575 
4627 
     | 
    
         
             
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
         
     | 
| 
       4576 
4628 
     | 
    
         
             
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
         
     | 
| 
       4577 
4629 
     | 
    
         | 
| 
      
 4630 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4631 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4632 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4633 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4634 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4635 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
         
     | 
| 
      
 4636 
     | 
    
         
            +
             
     | 
| 
      
 4637 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
         
     | 
| 
      
 4638 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
         
     | 
| 
      
 4639 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
         
     | 
| 
      
 4640 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
         
     | 
| 
      
 4641 
     | 
    
         
            +
                ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
         
     | 
| 
      
 4642 
     | 
    
         
            +
      4642 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
      4643 | +
      4644 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
      4645 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
      4646 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
      4647 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
      4648 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
      4649 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
      4650 | +
      4651 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
      4652 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
      4653 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
      4654 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
      4655 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
      4656 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
      4657 | +
      4658 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
      4659 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
      4660 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
      4661 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
      4662 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
      4663 | +    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
      4664 | +
 4578 4665 |     std::cerr << std::endl;
 4579 4666 | 
 4580 4667 |     const std::vector<size_t> vals {
@@ -4614,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 4614 4701 |     GGML_ASSERT(false);
 4615 4702 | #endif
 4616 4703 | 
 4617      | -    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
 4618      | -        // Resize buffer
 4619      | -        if (ctx->prealloc_qx != nullptr) {
 4620      | -            ggml_vk_destroy_buffer(ctx->prealloc_qx);
 4621      | -        }
 4622      | -        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
 4623      | -    }
 4624      | -    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
 4625      | -        // Resize buffer
 4626      | -        if (ctx->prealloc_qy != nullptr) {
 4627      | -            ggml_vk_destroy_buffer(ctx->prealloc_qy);
 4628      | -        }
 4629      | -        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
 4630      | -    }
 4631 4704 |     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
 4632 4705 |         // Resize buffer
 4633 4706 |         if (ctx->prealloc_x != nullptr) {
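The branches that remain in ggml_vk_preallocate_buffers (prealloc_x, prealloc_y, prealloc_split_k) all use the same grow-only resize pattern: reallocate only when the buffer is missing or too small, and never shrink. A minimal, self-contained sketch of that pattern for reference; Buffer, create_buffer, destroy_buffer and ensure_capacity are hypothetical stand-ins, not llama.cpp APIs:

    // Illustrative only; not part of the diff. Mirrors the grow-only resize logic
    // of the prealloc_x/prealloc_y/prealloc_split_k branches above.
    #include <cstddef>
    #include <cstdio>

    struct Buffer {
        size_t size;
    };

    static Buffer * create_buffer(size_t size) { return new Buffer{size}; } // stand-in for ggml_vk_create_buffer_device
    static void destroy_buffer(Buffer * buf)   { delete buf; }              // stand-in for ggml_vk_destroy_buffer

    static void ensure_capacity(Buffer *& buf, size_t needed) {
        // Reallocate only when the buffer is missing or too small; never shrink.
        if (buf == nullptr || (needed > 0 && buf->size < needed)) {
            if (buf != nullptr) {
                destroy_buffer(buf);
            }
            buf = create_buffer(needed);
        }
    }

    int main() {
        Buffer * prealloc_x = nullptr;
        ensure_capacity(prealloc_x, 1024); // first use: allocate
        ensure_capacity(prealloc_x, 512);  // smaller request: keep the existing buffer
        ensure_capacity(prealloc_x, 4096); // larger request: destroy and reallocate
        std::printf("final capacity: %zu\n", prealloc_x->size);
        destroy_buffer(prealloc_x);
        return 0;
    }

Once the largest shapes in a graph have been seen, this keeps repeated evaluations from churning device allocations.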
@@ -4661,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 4661 4734 | }
 4662 4735 | 
 4663 4736 | static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
 4664      | -
 4665      | -        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
 4666      | -        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 4667      | -
 4668      | -    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
      4737 | +    if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
 4669 4738 |         return;
 4670 4739 |     }
 4671 4740 | 
@@ -4693,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 4693 4762 |         }
 4694 4763 |         break;
 4695 4764 |     case GGML_OP_REPEAT:
 4696      | -
      4765 | +    case GGML_OP_GET_ROWS:
 4697 4766 |     case GGML_OP_ADD:
 4698 4767 |     case GGML_OP_MUL:
 4699 4768 |     case GGML_OP_SCALE:
@@ -4717,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 4717 4786 |     case GGML_OP_ARGSORT:
 4718 4787 |         break;
 4719 4788 |     default:
 4720      | -
 4721      | -
 4722      | -            GGML_ASSERT(false);
 4723      | -        }
      4789 | +        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
      4790 | +        GGML_ASSERT(false);
 4724 4791 |         return;
 4725 4792 |     }
 4726 4793 | 
@@ -4769,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 4769 4836 |     case GGML_OP_PERMUTE:
 4770 4837 |     case GGML_OP_TRANSPOSE:
 4771 4838 |     case GGML_OP_NONE:
 4772      | -        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
 4773      | -
 4774 4839 |         break;
 4775 4840 |     case GGML_OP_NORM:
 4776 4841 |         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -4837,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 4837 4902 | }
 4838 4903 | 
 4839 4904 | static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
 4840      | -
 4841      | -        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
 4842      | -        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 4843      | -
 4844      | -    if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
      4905 | +    if (ctx->disable) {
 4845 4906 |         return false;
 4846 4907 |     }
 4847 4908 | 
@@ -4884,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 4884 4945 |         break;
 4885 4946 |     case GGML_OP_MUL_MAT:
 4886 4947 |     case GGML_OP_MUL_MAT_ID:
 4887      | -        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
 4888      | -            return false;
 4889      | -        }
 4890      | -
 4891 4948 |         extra = (ggml_tensor_extra_gpu *) tensor->extra;
 4892 4949 | 
 4893 4950 |         break;
@@ -5001,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 5001 5058 | #endif
 5002 5059 |     ggml_vk_graph_cleanup(ctx);
 5003 5060 | 
 5004      | -    ggml_vk_destroy_buffer(ctx->prealloc_qx);
 5005      | -    ggml_vk_destroy_buffer(ctx->prealloc_qy);
 5006 5061 |     ggml_vk_destroy_buffer(ctx->prealloc_x);
 5007 5062 |     ggml_vk_destroy_buffer(ctx->prealloc_y);
 5008 5063 |     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5013,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 5013 5068 |         ggml_vk_destroy_buffer(buffer);
 5014 5069 |     }
 5015 5070 | 
 5016      | -    ctx->prealloc_size_qx = 0;
 5017      | -    ctx->prealloc_size_qy = 0;
 5018 5071 |     ctx->prealloc_size_x = 0;
 5019 5072 |     ctx->prealloc_size_y = 0;
 5020 5073 |     ctx->prealloc_size_split_k = 0;
@@ -5045,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
 5045 5098 |     snprintf(description, description_size, "%s", props.deviceName.data());
 5046 5099 | }
 5047 5100 | 
 5048      | -// CPU assist interface
 5049      | -
 5050      | -void ggml_vk_init_cpu_assist() {
 5051      | -    ggml_vk_instance_init();
 5052      | -
 5053      | -    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 5054      | -
 5055      | -    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
 5056      | -        ggml_vk_print_gpu_info(i);
 5057      | -    }
 5058      | -    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
 5059      | -    ggml_backend_vk_init(0);
 5060      | -}
 5061      | -
 5062      | -void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
 5063      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5064      | -
 5065      | -    if (!ctx->initialized) {
 5066      | -        return;
 5067      | -    }
 5068      | -
 5069      | -    ggml_vk_preallocate_buffers_graph(ctx, node);
 5070      | -}
 5071      | -
 5072      | -void ggml_vk_preallocate_buffers_cpu_assist() {
 5073      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5074      | -
 5075      | -    if (!ctx->initialized) {
 5076      | -        return;
 5077      | -    }
 5078      | -
 5079      | -    ggml_vk_preallocate_buffers(ctx);
 5080      | -}
 5081      | -
 5082      | -void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
 5083      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5084      | -
 5085      | -    if (!ctx->initialized) {
 5086      | -        return;
 5087      | -    }
 5088      | -
 5089      | -    ggml_vk_build_graph(ctx, node, last_node);
 5090      | -}
 5091      | -
 5092      | -bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
 5093      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5094      | -
 5095      | -    if (!ctx->initialized) {
 5096      | -        return false;
 5097      | -    }
 5098      | -
 5099      | -    return ggml_vk_compute_forward(ctx, params, tensor);
 5100      | -}
 5101      | -
 5102      | -void ggml_vk_graph_cleanup_cpu_assist() {
 5103      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5104      | -
 5105      | -    if (!ctx->initialized) {
 5106      | -        return;
 5107      | -    }
 5108      | -
 5109      | -    ggml_vk_graph_cleanup(ctx);
 5110      | -}
 5111      | -
 5112      | -void ggml_vk_free_cpu_assist() {
 5113      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 5114      | -
 5115      | -    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
 5116      | -        return;
 5117      | -    }
 5118      | -
 5119      | -    ggml_backend_vk_free(vk_instance.backends[0]);
 5120      | -}
 5121      | -
 5122 5101 | // backend interface
 5123 5102 | 
 5124 5103 | #define UNUSED GGML_UNUSED
@@ -5330,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 5330 5309 |     /* .is_host          = */ NULL,
 5331 5310 | };
 5332 5311 | 
 5333      | -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t 
      5312 | +GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 5334 5313 | #ifdef GGML_VULKAN_DEBUG
 5335      | -    std::cerr << "ggml_backend_vk_buffer_type(" << 
      5314 | +    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 5336 5315 | #endif
 5337 5316 | 
 5338      | -    GGML_ASSERT(
      5317 | +    GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 5339 5318 | 
 5340      | -    ggml_backend_vk_init(
      5319 | +    ggml_backend_vk_init(dev_num);
 5341 5320 | 
 5342      | -    return &vk_instance.buffer_types[
      5321 | +    return &vk_instance.buffer_types[dev_num];
 5343 5322 | }
 5344 5323 | 
 5345 5324 | // host buffer type
@@ -5508,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 5508 5487 |         vk_buffer src_buf = src_extra->buffer_gpu.lock();
 5509 5488 |         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 5510 5489 | 
 5511      | -        ggml_vk_buffer_copy_async(ctx->transfer_ctx, 
      5490 | +        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
 5512 5491 |         return true;
 5513 5492 |     }
 5514 5493 | 
@@ -5542,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
 5542 5521 | }
 5543 5522 | 
 5544 5523 | GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
      5524 | +#ifdef GGML_VULKAN_DEBUG
      5525 | +    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
      5526 | +#endif
 5545 5527 |     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 5546 5528 | 
 5547 5529 |     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -5566,7 +5548,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
 5566 5548 |     for (int i = 0; i < cgraph->n_nodes; i++) {
 5567 5549 |         ggml_tensor * node = cgraph->nodes[i];
 5568 5550 | 
 5569      | -        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
      5551 | +        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
 5570 5552 |             continue;
 5571 5553 |         }
 5572 5554 | 
@@ -5602,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
 5602 5584 |             }
 5603 5585 |             break;
 5604 5586 |         case GGML_OP_MUL_MAT:
 5605      | -        case GGML_OP_MUL_MAT_ID:
      5587 | +        // case GGML_OP_MUL_MAT_ID:
 5606 5588 |             {
      5589 | +                switch (op->src[0]->type) {
      5590 | +                    case GGML_TYPE_F32:
      5591 | +                    case GGML_TYPE_F16:
      5592 | +                    case GGML_TYPE_Q4_0:
      5593 | +                    case GGML_TYPE_Q4_1:
      5594 | +                    case GGML_TYPE_Q5_0:
      5595 | +                    case GGML_TYPE_Q5_1:
      5596 | +                    case GGML_TYPE_Q8_0:
      5597 | +                    case GGML_TYPE_Q2_K:
      5598 | +                    case GGML_TYPE_Q3_K:
      5599 | +                    case GGML_TYPE_Q4_K:
      5600 | +                    case GGML_TYPE_Q5_K:
      5601 | +                    case GGML_TYPE_Q6_K:
      5602 | +                        break;
      5603 | +                    default:
      5604 | +                        return false;
      5605 | +                }
 5607 5606 |                 struct ggml_tensor * a;
 5608 5607 |                 struct ggml_tensor * b;
 5609 5608 |                 if (op->op == GGML_OP_MUL_MAT) {
@@ -5618,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
 5618 5617 |                 }
 5619 5618 |                 return true;
 5620 5619 |             } break;
 5621      | -
 5622      | -
 5623      | -
 5624      | -
 5625      | -
 5626      | -
 5627      | -
 5628      | -
 5629      | -
 5630      | -
 5631      | -
 5632      | -
 5633      | -
 5634      | -
 5635      | -
      5620 | +        case GGML_OP_GET_ROWS:
      5621 | +            {
      5622 | +                switch (op->src[0]->type) {
      5623 | +                    case GGML_TYPE_F32:
      5624 | +                    case GGML_TYPE_F16:
      5625 | +                    case GGML_TYPE_Q4_0:
      5626 | +                    case GGML_TYPE_Q4_1:
      5627 | +                    case GGML_TYPE_Q5_0:
      5628 | +                    case GGML_TYPE_Q5_1:
      5629 | +                    case GGML_TYPE_Q8_0:
      5630 | +                        return true;
      5631 | +                    default:
      5632 | +                        return false;
      5633 | +                }
      5634 | +            } break;
 5636 5635 |         case GGML_OP_CPY:
      5636 | +        case GGML_OP_DUP:
 5637 5637 |             {
 5638 5638 |                 ggml_type src0_type = op->src[0]->type;
 5639      | -                ggml_type src1_type = op->src[1]->type;
      5639 | +                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
 5640 5640 |                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
 5641 5641 |                     return true;
 5642 5642 |                 }
@@ -5648,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
 5648 5648 |                 }
 5649 5649 |                 return false;
 5650 5650 |             } break;
 5651      | -        case GGML_OP_DUP:
 5652 5651 |         // case GGML_OP_REPEAT:
 5653 5652 |         //     {
 5654 5653 |         //         ggml_type src0_type = op->src[0]->type;
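The type whitelist above is what the backend reports through its supports_op hook, so a caller can probe it with ggml_backend_supports_op before scheduling work. A minimal sketch of such a probe, assuming a Vulkan-enabled ggml build with at least one device; the tensor shapes are arbitrary and only tensor metadata is created:

    // Illustrative only; not part of the diff. Asks the Vulkan backend whether it
    // can run MUL_MAT with a Q4_K weight matrix via the supports_op path shown above.
    #include <cstdio>

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    int main() {
        // Metadata-only context; no data buffers are needed for a supports_op query.
        ggml_init_params params = { 16 * ggml_tensor_overhead(), NULL, /*no_alloc=*/true };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_K, 256, 32); // quantized weights
        ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  256,  8); // activations
        ggml_tensor * mm = ggml_mul_mat(ctx, a, b);

        ggml_backend_t backend = ggml_backend_vk_init(0);
        std::printf("Q4_K mul_mat supported: %s\n",
                    ggml_backend_supports_op(backend, mm) ? "yes" : "no");

        ggml_backend_free(backend);
        ggml_free(ctx);
        return 0;
    }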
@@ -5685,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
 5685 5684 |     UNUSED(backend);
 5686 5685 | }
 5687 5686 | 
      5687 | +GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
      5688 | +    const ggml_tensor * dst = op;
      5689 | +
      5690 | +    const int min_batch_size = 32;
      5691 | +
      5692 | +    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
      5693 | +        return true;
      5694 | +    }
      5695 | +
      5696 | +    return false;
      5697 | +
      5698 | +    UNUSED(backend);
      5699 | +}
      5700 | +
 5688 5701 | // TODO: enable async and synchronize
 5689 5702 | static ggml_backend_i ggml_backend_vk_interface = {
 5690 5703 |     /* .get_name                = */ ggml_backend_vk_name,
@@ -5699,7 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
 5699 5712 |     /* .graph_plan_compute      = */ NULL,
 5700 5713 |     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
 5701 5714 |     /* .supports_op             = */ ggml_backend_vk_supports_op,
 5702      | -    /* .offload_op              = */ 
      5715 | +    /* .offload_op              = */ ggml_backend_vk_offload_op,
 5703 5716 |     /* .event_new               = */ NULL,
 5704 5717 |     /* .event_free              = */ NULL,
 5705 5718 |     /* .event_record            = */ NULL,
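The new ggml_backend_vk_offload_op hook above reduces to one test: offload when the destination's second dimension (the batch/column count, ne[1]) exceeds 32 and the op is not GGML_OP_GET_ROWS. A standalone restatement of that predicate, for illustration only; worth_offloading is a hypothetical name:

    // Illustrative only; not part of the diff. Restates the batch-size heuristic of
    // ggml_backend_vk_offload_op for a few example batch sizes.
    #include <cstdint>
    #include <cstdio>

    static bool worth_offloading(int64_t batch_size, bool is_get_rows) {
        const int64_t min_batch_size = 32; // same threshold as the Vulkan hook
        return batch_size > min_batch_size && !is_get_rows;
    }

    int main() {
        const int64_t sizes[] = { 1, 8, 32, 33, 512 };
        for (int64_t n : sizes) {
            std::printf("batch %4lld -> offload: %s\n",
                        (long long) n, worth_offloading(n, /*is_get_rows=*/false) ? "yes" : "no");
        }
        return 0;
    }

Wired into ggml_backend_vk_interface above, this is the hook the ggml scheduler consults when deciding whether large-batch operations are worth offloading even though their weights reside in host memory.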
@@ -5712,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
 5712 5725 |     return &guid;
 5713 5726 | }
 5714 5727 | 
 5715      | -GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t 
 5716      | -    if (vk_instance.initialized[
 5717      | -        return vk_instance.backends[
      5728 | +GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
      5729 | +    if (vk_instance.initialized[dev_num]) {
      5730 | +        return vk_instance.backends[dev_num];
 5718 5731 |     }
 5719 5732 | #ifdef GGML_VULKAN_DEBUG
 5720      | -    std::cerr << "ggml_backend_vk_init(" << 
      5733 | +    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
 5721 5734 | #endif
 5722 5735 | 
 5723      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[
 5724      | -    ggml_vk_init(ctx, 
 5725      | -    ctx->name = GGML_VK_NAME + std::to_string(
 5726      | -    vk_instance.buffer_types[
      5736 | +    ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
      5737 | +    ggml_vk_init(ctx, dev_num);
      5738 | +    ctx->name = GGML_VK_NAME + std::to_string(dev_num);
      5739 | +    vk_instance.buffer_types[dev_num] = {
 5727 5740 |         /* .iface    = */ ggml_backend_vk_buffer_type_interface,
 5728 5741 |         /* .context  = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
 5729 5742 |     };
 5730      | -    vk_instance.initialized[
      5743 | +    vk_instance.initialized[dev_num] = true;
 5731 5744 | 
 5732 5745 |     ggml_backend_t vk_backend = new ggml_backend {
 5733 5746 |         /* .guid      = */ ggml_backend_vk_guid(),
@@ -5735,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
 5735 5748 |         /* .context   = */ &vk_instance.contexts[ctx->idx],
 5736 5749 |     };
 5737 5750 | 
 5738      | -    vk_instance.backends[
      5751 | +    vk_instance.backends[dev_num] = vk_backend;
 5739 5752 | 
 5740 5753 |     return vk_backend;
 5741 5754 | }
@@ -5779,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
 5779 5792 | extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
 5780 5793 | 
 5781 5794 | GGML_CALL int ggml_backend_vk_reg_devices() {
 5782      | -
      5795 | +    ggml_vk_instance_init();
      5796 | +
      5797 | +    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
 5783 5798 |         char name[128];
 5784      | -        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, 
 5785      | -        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(
      5799 | +        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
      5800 | +        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i);  // NOLINT
 5786 5801 |     }
 5787 5802 |     return vk_instance.device_indices.size();
 5788 5803 | }
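For reference, the dev_num-based entry points in the hunks above can be driven directly from application code. A minimal sketch, assuming a Vulkan-enabled ggml build with at least one usable device and that the device-count/description helpers are exposed in ggml-vulkan.h as in upstream llama.cpp; error handling is omitted:

    // Illustrative only; not part of the diff. Enumerates Vulkan devices, then brings up
    // device 0 and fetches its buffer type through the functions shown in the diff.
    #include <cstdio>

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    int main() {
        const int n_devices = ggml_backend_vk_get_device_count();
        std::printf("found %d Vulkan device(s)\n", n_devices);
        if (n_devices == 0) {
            return 1;
        }

        char description[128];
        ggml_backend_vk_get_device_description(0, description, sizeof(description));
        std::printf("device 0: %s\n", description);

        ggml_backend_t backend = ggml_backend_vk_init(/*dev_num=*/0);
        ggml_backend_buffer_type_t buft = ggml_backend_vk_buffer_type(/*dev_num=*/0);
        std::printf("backend: %s, buffer type: %s\n",
                    ggml_backend_name(backend), ggml_backend_buft_name(buft));

        ggml_backend_free(backend);
        return 0;
    }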
@@ -5866,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 5866 5881 |                 val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
 5867 5882 |             } else if (tensor->type == GGML_TYPE_F16) {
 5868 5883 |                 val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
      5884 | +            } else {
      5885 | +                GGML_ASSERT(false);
 5869 5886 |             }
 5870 5887 |             fprintf(stderr, "% 7.2f ", val);
 5871 5888 |         } else {
@@ -5960,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 5960 5977 |         return;
 5961 5978 |     }
 5962 5979 | 
      5980 | +#ifdef GGML_VULKAN_DEBUG
      5981 | +    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
      5982 | +#endif
      5983 | +
 5963 5984 |     ggml_tensor * src0 = tensor->src[0];
 5964 5985 |     ggml_tensor * src1 = tensor->src[1];
 5965 5986 |     ggml_tensor * src2 = tensor->src[2];
@@ -6219,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 6219 6240 |         tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
 6220 6241 |     } else if (tensor->op == GGML_OP_TRANSPOSE) {
 6221 6242 |         tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
      6243 | +    } else if (tensor->op == GGML_OP_GET_ROWS) {
      6244 | +        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
 6222 6245 |     } else {
 6223 6246 |         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
 6224 6247 |         GGML_ASSERT(false);
@@ -6269,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 6269 6292 |         return;
 6270 6293 |     }
 6271 6294 | 
      6295 | +#ifdef GGML_VULKAN_DEBUG
      6296 | +    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
      6297 | +#endif
      6298 | +
 6272 6299 |     ggml_tensor * src0 = tensor->src[0];
 6273 6300 |     ggml_tensor * src1 = tensor->src[1];
 6274 6301 | 
@@ -6412,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 6412 6439 |         free(tensor_data);
 6413 6440 |     }
 6414 6441 | }
 6415      | -
 6416      | -void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
 6417      | -    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
 6418      | -
 6419      | -    ggml_vk_check_results_0(ctx, params, tensor);
 6420      | -}
 6421 6442 | #endif