llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
| @@ -63,7 +63,9 @@ struct ggml_metal_context { | |
| 63 63 | 
             
                GGML_METAL_DECL_KERNEL(relu);
         | 
| 64 64 | 
             
                GGML_METAL_DECL_KERNEL(gelu);
         | 
| 65 65 | 
             
                GGML_METAL_DECL_KERNEL(soft_max);
         | 
| 66 | 
            +
                GGML_METAL_DECL_KERNEL(soft_max_4);
         | 
| 66 67 | 
             
                GGML_METAL_DECL_KERNEL(diag_mask_inf);
         | 
| 68 | 
            +
                GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
         | 
| 67 69 | 
             
                GGML_METAL_DECL_KERNEL(get_rows_f16);
         | 
| 68 70 | 
             
                GGML_METAL_DECL_KERNEL(get_rows_q4_0);
         | 
| 69 71 | 
             
                GGML_METAL_DECL_KERNEL(get_rows_q4_1);
         | 
| @@ -76,6 +78,8 @@ struct ggml_metal_context { | |
| 76 78 | 
             
                GGML_METAL_DECL_KERNEL(rms_norm);
         | 
| 77 79 | 
             
                GGML_METAL_DECL_KERNEL(norm);
         | 
| 78 80 | 
             
                GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
         | 
| 81 | 
            +
                GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
         | 
| 82 | 
            +
                GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
         | 
| 79 83 | 
             
                GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
         | 
| 80 84 | 
             
                GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
         | 
| 81 85 | 
             
                GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
         | 
| @@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal"; | |
| 116 120 | 
             
            struct ggml_metal_context * ggml_metal_init(int n_cb) {
         | 
| 117 121 | 
             
                metal_printf("%s: allocating\n", __func__);
         | 
| 118 122 |  | 
| 119 | 
            -
                 | 
| 123 | 
            +
                id <MTLDevice> device;
         | 
| 124 | 
            +
                NSString * s;
         | 
| 125 | 
            +
             | 
| 126 | 
            +
            #if TARGET_OS_OSX
         | 
| 127 | 
            +
                // Show all the Metal device instances in the system
         | 
| 128 | 
            +
                NSArray * devices = MTLCopyAllDevices();
         | 
| 129 | 
            +
                for (device in devices) {
         | 
| 130 | 
            +
                    s = [device name];
         | 
| 131 | 
            +
                    metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
         | 
| 132 | 
            +
                }
         | 
| 133 | 
            +
            #endif
         | 
| 120 134 |  | 
| 135 | 
            +
                // Pick and show default Metal device
         | 
| 136 | 
            +
                device = MTLCreateSystemDefaultDevice();
         | 
| 137 | 
            +
                s = [device name];
         | 
| 138 | 
            +
                metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                // Configure context
         | 
| 141 | 
            +
                struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
         | 
| 142 | 
            +
                ctx->device = device;
         | 
| 121 143 | 
             
                ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
         | 
| 122 | 
            -
                ctx->device = MTLCreateSystemDefaultDevice();
         | 
| 123 144 | 
             
                ctx->queue  = [ctx->device newCommandQueue];
         | 
| 124 145 | 
             
                ctx->n_buffers = 0;
         | 
| 125 146 | 
             
                ctx->concur_list_len = 0;
         | 
| 126 147 |  | 
| 127 148 | 
             
                ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
         | 
| 128 149 |  | 
| 129 | 
            -
            # | 
| 130 | 
            -
                //  | 
| 150 | 
            +
            #ifdef GGML_SWIFT
         | 
| 151 | 
            +
                // load the default.metallib file
         | 
| 131 152 | 
             
                {
         | 
| 132 153 | 
             
                    NSError * error = nil;
         | 
| 133 154 |  | 
| 134 | 
            -
                     | 
| 155 | 
            +
                    NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
         | 
| 156 | 
            +
                    NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
         | 
| 157 | 
            +
                    NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
         | 
| 158 | 
            +
                    NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
         | 
| 159 | 
            +
                    NSURL * libURL = [NSURL fileURLWithPath:libPath];
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                    // Load the metallib file into a Metal library
         | 
| 162 | 
            +
                    ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
         | 
| 163 | 
            +
             | 
| 135 164 | 
             
                    if (error) {
         | 
| 136 165 | 
             
                        metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
         | 
| 137 166 | 
             
                        return NULL;
         | 
| @@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { | |
| 192 221 | 
             
                    GGML_METAL_ADD_KERNEL(relu);
         | 
| 193 222 | 
             
                    GGML_METAL_ADD_KERNEL(gelu);
         | 
| 194 223 | 
             
                    GGML_METAL_ADD_KERNEL(soft_max);
         | 
| 224 | 
            +
                    GGML_METAL_ADD_KERNEL(soft_max_4);
         | 
| 195 225 | 
             
                    GGML_METAL_ADD_KERNEL(diag_mask_inf);
         | 
| 226 | 
            +
                    GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
         | 
| 196 227 | 
             
                    GGML_METAL_ADD_KERNEL(get_rows_f16);
         | 
| 197 228 | 
             
                    GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         | 
| 198 229 | 
             
                    GGML_METAL_ADD_KERNEL(get_rows_q4_1);
         | 
| @@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { | |
| 205 236 | 
             
                    GGML_METAL_ADD_KERNEL(rms_norm);
         | 
| 206 237 | 
             
                    GGML_METAL_ADD_KERNEL(norm);
         | 
| 207 238 | 
             
                    GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         | 
| 239 | 
            +
                    GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         | 
| 240 | 
            +
                    GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
         | 
| 208 241 | 
             
                    GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         | 
| 209 242 | 
             
                    GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         | 
| 210 243 | 
             
                    GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
         | 
| @@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { | |
| 231 264 | 
             
            #undef GGML_METAL_ADD_KERNEL
         | 
| 232 265 | 
             
                }
         | 
| 233 266 |  | 
| 234 | 
            -
                metal_printf("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
         | 
| 235 267 | 
             
                metal_printf("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
         | 
| 268 | 
            +
            #if TARGET_OS_OSX
         | 
| 269 | 
            +
                metal_printf("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
         | 
| 236 270 | 
             
                if (ctx->device.maxTransferRate != 0) {
         | 
| 237 271 | 
             
                    metal_printf("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
         | 
| 238 272 | 
             
                } else {
         | 
| 239 273 | 
             
                    metal_printf("%s: maxTransferRate               = built-in GPU\n", __func__);
         | 
| 240 274 | 
             
                }
         | 
| 275 | 
            +
            #endif
         | 
| 241 276 |  | 
| 242 277 | 
             
                return ctx;
         | 
| 243 278 | 
             
            }
         | 
| @@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { | |
| 257 292 | 
             
                GGML_METAL_DEL_KERNEL(relu);
         | 
| 258 293 | 
             
                GGML_METAL_DEL_KERNEL(gelu);
         | 
| 259 294 | 
             
                GGML_METAL_DEL_KERNEL(soft_max);
         | 
| 260 | 
            -
                GGML_METAL_DEL_KERNEL( | 
| 295 | 
            +
                GGML_METAL_DEL_KERNEL(soft_max_4);
         | 
| 296 | 
            +
                GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
         | 
| 261 297 | 
             
                GGML_METAL_DEL_KERNEL(get_rows_f16);
         | 
| 262 298 | 
             
                GGML_METAL_DEL_KERNEL(get_rows_q4_0);
         | 
| 263 299 | 
             
                GGML_METAL_DEL_KERNEL(get_rows_q4_1);
         | 
| @@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { | |
| 270 306 | 
             
                GGML_METAL_DEL_KERNEL(rms_norm);
         | 
| 271 307 | 
             
                GGML_METAL_DEL_KERNEL(norm);
         | 
| 272 308 | 
             
                GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
         | 
| 309 | 
            +
                GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
         | 
| 310 | 
            +
                GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
         | 
| 273 311 | 
             
                GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
         | 
| 274 312 | 
             
                GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
         | 
| 275 313 | 
             
                GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
         | 
| @@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { | |
| 310 348 |  | 
| 311 349 | 
             
            void * ggml_metal_host_malloc(size_t n) {
         | 
| 312 350 | 
             
                void * data = NULL;
         | 
| 313 | 
            -
                const int result = posix_memalign((void **) &data,  | 
| 351 | 
            +
                const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
         | 
| 314 352 | 
             
                if (result != 0) {
         | 
| 315 353 | 
             
                    metal_printf("%s: error: posix_memalign failed\n", __func__);
         | 
| 316 354 | 
             
                    return NULL;
         | 
| @@ -384,7 +422,7 @@ bool ggml_metal_add_buffer( | |
| 384 422 | 
             
                        }
         | 
| 385 423 | 
             
                    }
         | 
| 386 424 |  | 
| 387 | 
            -
                    const size_t size_page =  | 
| 425 | 
            +
                    const size_t size_page = sysconf(_SC_PAGESIZE);
         | 
| 388 426 |  | 
| 389 427 | 
             
                    size_t size_aligned = size;
         | 
| 390 428 | 
             
                    if ((size_aligned % size_page) != 0) {
         | 
| @@ -437,6 +475,7 @@ bool ggml_metal_add_buffer( | |
| 437 475 | 
             
                        }
         | 
| 438 476 | 
             
                    }
         | 
| 439 477 |  | 
| 478 | 
            +
            #if TARGET_OS_OSX
         | 
| 440 479 | 
             
                    metal_printf(", (%8.2f / %8.2f)",
         | 
| 441 480 | 
             
                            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
         | 
| 442 481 | 
             
                            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
         | 
| @@ -446,6 +485,9 @@ bool ggml_metal_add_buffer( | |
| 446 485 | 
             
                    } else {
         | 
| 447 486 | 
             
                        metal_printf("\n");
         | 
| 448 487 | 
             
                    }
         | 
| 488 | 
            +
            #else
         | 
| 489 | 
            +
                    metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
         | 
| 490 | 
            +
            #endif
         | 
| 449 491 | 
             
                }
         | 
| 450 492 |  | 
| 451 493 | 
             
                return true;
         | 
| @@ -733,7 +775,7 @@ void ggml_metal_graph_compute( | |
| 733 775 | 
             
                                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
         | 
| 734 776 | 
             
                                        [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
         | 
| 735 777 |  | 
| 736 | 
            -
                                        const int64_t n = ggml_nelements(dst);
         | 
| 778 | 
            +
                                        const int64_t n = ggml_nelements(dst)/4;
         | 
| 737 779 |  | 
| 738 780 | 
             
                                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         | 
| 739 781 | 
             
                                    } break;
         | 
| @@ -745,7 +787,7 @@ void ggml_metal_graph_compute( | |
| 745 787 | 
             
                                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         | 
| 746 788 | 
             
                                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
         | 
| 747 789 |  | 
| 748 | 
            -
                                                const int64_t n = ggml_nelements(dst);
         | 
| 790 | 
            +
                                                const int64_t n = ggml_nelements(dst)/4;
         | 
| 749 791 |  | 
| 750 792 | 
             
                                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         | 
| 751 793 | 
             
                                            } break;
         | 
| @@ -765,7 +807,7 @@ void ggml_metal_graph_compute( | |
| 765 807 | 
             
                                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         | 
| 766 808 | 
             
                                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
         | 
| 767 809 |  | 
| 768 | 
            -
                                                const int64_t n = ggml_nelements(dst);
         | 
| 810 | 
            +
                                                const int64_t n = ggml_nelements(dst)/4;
         | 
| 769 811 |  | 
| 770 812 | 
             
                                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         | 
| 771 813 | 
             
                                            } break;
         | 
| @@ -779,13 +821,16 @@ void ggml_metal_graph_compute( | |
| 779 821 | 
             
                                    {
         | 
| 780 822 | 
             
                                        const int nth = 32;
         | 
| 781 823 |  | 
| 782 | 
            -
                                         | 
| 824 | 
            +
                                        if (ne00%4 == 0) {
         | 
| 825 | 
            +
                                            [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
         | 
| 826 | 
            +
                                        } else {
         | 
| 827 | 
            +
                                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
         | 
| 828 | 
            +
                                        }
         | 
| 783 829 | 
             
                                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         | 
| 784 830 | 
             
                                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
         | 
| 785 831 | 
             
                                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
         | 
| 786 832 | 
             
                                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
         | 
| 787 833 | 
             
                                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
         | 
| 788 | 
            -
                                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
         | 
| 789 834 |  | 
| 790 835 | 
             
                                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         | 
| 791 836 | 
             
                                    } break;
         | 
| @@ -793,14 +838,23 @@ void ggml_metal_graph_compute( | |
| 793 838 | 
             
                                    {
         | 
| 794 839 | 
             
                                        const int n_past = ((int32_t *)(dst->op_params))[0];
         | 
| 795 840 |  | 
| 796 | 
            -
                                         | 
| 841 | 
            +
                                        if (ne00%8 == 0) {
         | 
| 842 | 
            +
                                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
         | 
| 843 | 
            +
                                        } else {
         | 
| 844 | 
            +
                                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
         | 
| 845 | 
            +
                                        }
         | 
| 797 846 | 
             
                                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         | 
| 798 847 | 
             
                                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
         | 
| 799 848 | 
             
                                        [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
         | 
| 800 849 | 
             
                                        [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
         | 
| 801 850 | 
             
                                        [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
         | 
| 802 851 |  | 
| 803 | 
            -
                                         | 
| 852 | 
            +
                                        if (ne00%8 == 0) {
         | 
| 853 | 
            +
                                            [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         | 
| 854 | 
            +
                                        }
         | 
| 855 | 
            +
                                        else {
         | 
| 856 | 
            +
                                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         | 
| 857 | 
            +
                                        }
         | 
| 804 858 | 
             
                                    } break;
         | 
| 805 859 | 
             
                                case GGML_OP_MUL_MAT:
         | 
| 806 860 | 
             
                                    {
         | 
| @@ -847,6 +901,7 @@ void ggml_metal_graph_compute( | |
| 847 901 | 
             
                                        } else {
         | 
| 848 902 | 
             
                                            int nth0 = 32;
         | 
| 849 903 | 
             
                                            int nth1 = 1;
         | 
| 904 | 
            +
                                            int nrows = 1;
         | 
| 850 905 |  | 
| 851 906 | 
             
                                            // use custom matrix x vector kernel
         | 
| 852 907 | 
             
                                            switch (src0t) {
         | 
| @@ -854,7 +909,15 @@ void ggml_metal_graph_compute( | |
| 854 909 | 
             
                                                    {
         | 
| 855 910 | 
             
                                                        nth0 = 32;
         | 
| 856 911 | 
             
                                                        nth1 = 1;
         | 
| 857 | 
            -
                                                         | 
| 912 | 
            +
                                                        if (ne11 * ne12 < 4) {
         | 
| 913 | 
            +
                                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
         | 
| 914 | 
            +
                                                        } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
         | 
| 915 | 
            +
                                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
         | 
| 916 | 
            +
                                                            nrows = ne11;
         | 
| 917 | 
            +
                                                        } else {
         | 
| 918 | 
            +
                                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
         | 
| 919 | 
            +
                                                            nrows = 4;
         | 
| 920 | 
            +
                                                        }
         | 
| 858 921 | 
             
                                                    } break;
         | 
| 859 922 | 
             
                                                case GGML_TYPE_Q4_0:
         | 
| 860 923 | 
             
                                                    {
         | 
| @@ -906,8 +969,8 @@ void ggml_metal_graph_compute( | |
| 906 969 | 
             
                                                        GGML_ASSERT(ne02 == 1);
         | 
| 907 970 | 
             
                                                        GGML_ASSERT(ne12 == 1);
         | 
| 908 971 |  | 
| 909 | 
            -
                                                        nth0 =  | 
| 910 | 
            -
                                                        nth1 = 32;
         | 
| 972 | 
            +
                                                        nth0 = 4; //1;
         | 
| 973 | 
            +
                                                        nth1 = 8; //32;
         | 
| 911 974 | 
             
                                                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
         | 
| 912 975 | 
             
                                                    } break;
         | 
| 913 976 | 
             
                                                case GGML_TYPE_Q5_K:
         | 
| @@ -955,9 +1018,12 @@ void ggml_metal_graph_compute( | |
| 955 1018 | 
             
                                            [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];
         | 
| 956 1019 |  | 
| 957 1020 | 
             
                                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
         | 
| 958 | 
            -
                                                src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
         | 
| 1021 | 
            +
                                                src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
         | 
| 959 1022 | 
             
                                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         | 
| 960 1023 | 
             
                                            }
         | 
| 1024 | 
            +
                                            else if (src0t == GGML_TYPE_Q4_K) {
         | 
| 1025 | 
            +
                                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         | 
| 1026 | 
            +
                                            }
         | 
| 961 1027 | 
             
                                            else if (src0t == GGML_TYPE_Q3_K) {
         | 
| 962 1028 | 
             
            #ifdef GGML_QKK_64
         | 
| 963 1029 | 
             
                                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         | 
| @@ -971,8 +1037,8 @@ void ggml_metal_graph_compute( | |
| 971 1037 | 
             
                                            else if (src0t == GGML_TYPE_Q6_K) {
         | 
| 972 1038 | 
             
                                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         | 
| 973 1039 | 
             
                                            } else {
         | 
| 974 | 
            -
                                                 | 
| 975 | 
            -
                                                [encoder dispatchThreadgroups:MTLSizeMake(ne01,  | 
| 1040 | 
            +
                                                int64_t ny = (ne11 + nrows - 1)/nrows;
         | 
| 1041 | 
            +
                                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         | 
| 976 1042 | 
             
                                            }
         | 
| 977 1043 | 
             
                                        }
         | 
| 978 1044 | 
             
                                    } break;
         | 
| @@ -1117,7 +1183,7 @@ void ggml_metal_graph_compute( | |
| 1117 1183 | 
             
                                        [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
         | 
| 1118 1184 | 
             
                                        [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
         | 
| 1119 1185 |  | 
| 1120 | 
            -
                                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake( | 
| 1186 | 
            +
                                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
         | 
| 1121 1187 | 
             
                                    } break;
         | 
| 1122 1188 | 
             
                                case GGML_OP_DUP:
         | 
| 1123 1189 | 
             
                                case GGML_OP_CPY:
         |