llama_cpp 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
- data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
+ metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+ data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
  SHA512:
- metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
- data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
+ metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+ data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+ - Bump bundled llama.cpp from master-b1140 to master-b1198.
+
  ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02

  **Breaking Changes**
@@ -1,3 +1,8 @@
+ // defines MAP_ANONYMOUS
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
  #include "ggml-alloc.h"
  #include "ggml.h"
  #include <assert.h>
@@ -6,6 +11,26 @@
  #include <stdlib.h>
  #include <string.h>

+ #ifdef __has_include
+ #if __has_include(<unistd.h>)
+ #include <unistd.h>
+ #if defined(_POSIX_MAPPED_FILES)
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #endif
+ #endif
+ #endif
+
+ #if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #include <memoryapi.h>
+ #endif
+
+
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
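
The guarded includes above let ggml-alloc.c decide at compile time whether POSIX mmap is available, and pull in the Windows virtual-memory headers otherwise. A minimal standalone sketch of the same detection pattern; `HAVE_MMAP` is a hypothetical marker macro used only in this sketch, not part of ggml:

```c
/* Compile-time feature detection for POSIX mmap, mirroring the
 * guarded includes in the hunk above. */
#ifdef __has_include
#  if __has_include(<unistd.h>)
#    include <unistd.h>
#    if defined(_POSIX_MAPPED_FILES)
#      include <sys/types.h>
#      include <sys/mman.h>
#      define HAVE_MMAP 1
#    endif
#  endif
#endif

#include <stdio.h>

int main(void) {
#ifdef HAVE_MMAP
    puts("POSIX mmap is available on this platform");
#else
    puts("POSIX mmap is not available; use a different allocation strategy");
#endif
    return 0;
}
```
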
@@ -99,19 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif

-
- static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  return ggml_nbytes(tensor);

  UNUSED(alloc);
  }

+ // check if a tensor is allocated by this buffer
+ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+ void * ptr = tensor->data;
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
- GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
  #endif
- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  if (best_fit_block == -1) {
  // the last block is our last resort
  struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
- max_avail = MAX(max_avail, block->size);
  } else {
  fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
  __func__, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
- return;
+ return;
  }
  }
  struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }

  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  void * ptr = tensor->data;

- if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+ if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
  return;
  }

- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);

@@ -281,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }

- // address and size of the buffer when measuring
- // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
- static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
- static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+ // OS specific functions to allocate and free uncommitted virtual memory
+ static void * alloc_vmem(size_t size) {
+ #if defined(_WIN32)
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+ #elif defined(_POSIX_MAPPED_FILES)
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+ return ptr;
+ #else
+ // use a fixed address for other platforms
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
+ return (void *)base_addr;
+ #endif
+ }
+
+ static void free_vmem(void * base_addr, size_t size) {
+ #if defined(_WIN32)
+ VirtualFree(base_addr, 0, MEM_RELEASE);
+ UNUSED(size);
+ #elif defined(_POSIX_MAPPED_FILES)
+ munmap(base_addr, size);
+ #else
+ // nothing to do
+ UNUSED(base_addr);
+ UNUSED(size);
+ #endif
+ }
+
+ // allocate uncommitted virtual memory to measure the size of the graph
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+ // 1TB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ do {
+ *base_addr = alloc_vmem(*size);
+ if (*base_addr != NULL) {
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+ return;
+ }
+ // try again with half the size
+ *size /= 2;
+ } while (*size > 0);
+
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+ }
+
+ static void free_measure_vmem(void * base_addr, size_t size) {
+ free_vmem(base_addr, size);
+ }

  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

+ void * base_addr;
+ size_t size;
+
+ alloc_measure_vmem(&base_addr, &size);
+
  *alloc = (struct ggml_allocr){
- /*.data = */ MEASURE_BASE_ADDR,
- /*.size = */ MEASURE_MAX_SIZE,
+ /*.data = */ base_addr,
+ /*.size = */ size,
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
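
The measure allocator no longer pretends to own a fixed 1 TB range at address 0x1000; it now reserves real but uncommitted address space from the OS and halves the request until the reservation succeeds. Below is a POSIX-only sketch of that reserve/probe/release pattern, assuming mmap with PROT_NONE is available (on Windows the patch uses VirtualAlloc with MEM_RESERVE instead); the names are illustrative, not the library's API:

```c
#define _GNU_SOURCE        /* for MAP_ANONYMOUS, as the patch defines at the top of the file */
#include <stdio.h>
#include <sys/mman.h>

/* Reserve address space without committing physical memory, halving the
 * request until it succeeds, the same strategy as alloc_measure_vmem. */
static void * reserve_probe(size_t * size) {
    while (*size > 0) {
        void * p = mmap(NULL, *size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p != MAP_FAILED) {
            return p;
        }
        *size /= 2;        /* try again with half the size */
    }
    return NULL;
}

int main(void) {
    size_t size = (size_t)1 << 40;   /* ask for 1 TB of address space (64-bit) */
    void * base = reserve_probe(&size);
    if (base != NULL) {
        printf("reserved %zu bytes of uncommitted address space at %p\n", size, base);
        munmap(base, size);          /* release the reservation */
    }
    return 0;
}
```
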
@@ -311,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }

  void ggml_allocr_free(struct ggml_allocr * alloc) {
+ if (alloc->measure) {
+ free_measure_vmem(alloc->data, alloc->size);
+ }
  free(alloc);
  }

@@ -380,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }

  // if the node's data is external, then we cannot re-use it
- if ((char *) parent->data < (char *) alloc->data ||
- (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+ if (ggml_allocr_is_own(alloc, parent) == false) {
  AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
  continue;
  }
@@ -415,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }

- static size_t ggml_allocator_alloc_graph_tensors_n(
+ static size_t ggml_allocr_alloc_graph_tensors_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  AT_PRINTF("\n");
  }

-
  // update parents
  // update immediately if there is no parse_seq
  // update only at barriers if there is parse_seq
- if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
  int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
  int update_end = alloc->parse_seq_len ? ind : ind + 1;
  for (int i = update_start; i < update_end; i++) {
@@ -521,12 +603,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  view_src_hn->n_views -= 1;
  AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ ggml_allocr_free_tensor(alloc, view_src);
  }
  }
  else {
  if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ ggml_allocr_free_tensor(alloc, parent);
  }
  }
  }
@@ -543,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  for (int i = 0; outputs[g][i] != NULL; i++) {
  struct ggml_tensor * output = outputs[g][i];
  AT_PRINTF("output: %s\n", output->name);
- ggml_allocator_free_tensor(alloc, output);
+ ggml_allocr_free_tensor(alloc, output);
  }
  }
  }
@@ -552,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  }

  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
  }
@@ -81,12 +81,29 @@
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

+ #ifndef __has_builtin
+ #define __has_builtin(x) 0
+ #endif
+
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ #if __has_builtin(__builtin_elementwise_sub_sat)
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
  return reinterpret_cast<const int&>(c);
+ #else
+ int8x4_t c;
+ int16_t tmp;
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ tmp = va[i] - vb[i];
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+ c[i] = tmp;
+ }
+ return reinterpret_cast<int&>(c);
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
  }

  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
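
When the HIP toolchain lacks __builtin_elementwise_sub_sat, the new fallback clamps each 8-bit lane through a wider intermediate. The per-lane arithmetic is plain saturating subtraction, sketched here in C (illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

/* Saturating int8 subtraction: widen, subtract, clamp to [-128, 127],
 * which is what the fallback branch above does for each of the 4 lanes. */
static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = (int16_t)a - (int16_t)b;
    if (tmp > INT8_MAX) tmp = INT8_MAX;
    if (tmp < INT8_MIN) tmp = INT8_MIN;
    return (int8_t)tmp;
}

int main(void) {
    printf("%d\n", sub_sat_i8(-100, 100)); /* saturates to -128 */
    printf("%d\n", sub_sat_i8(100, -100)); /* saturates to  127 */
    return 0;
}
```
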
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  const float eps = 1e-5f;

- float mean = 0.0f;
- float var = 0.0f;
+ float2 mean_var = make_float2(0.f, 0.f);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
- mean += xi;
- var += xi * xi;
+ mean_var.x += xi;
+ mean_var.y += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
- var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+ mean_var = warp_reduce_sum(mean_var);
+ if (block_size > WARP_SIZE) {
+ __shared__ float2 s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = mean_var;
+ }
+ __syncthreads();
+ mean_var = s_sum[lane_id];
+ mean_var = warp_reduce_sum(mean_var);
  }

- mean /= ncols;
- var = var / ncols - mean * mean;
- const float inv_var = rsqrtf(var + eps);
+ const float mean = mean_var.x / ncols;
+ const float var = mean_var.y / ncols - mean * mean;
+ const float inv_std = rsqrtf(var + eps);
+
+ for (int col = tid; col < ncols; col += block_size) {
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+ }
+ }

- for (int col = tid; col < ncols; col += WARP_SIZE) {
- dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
  }
+ return x;
  }

+ template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  float tmp = 0.0f; // partial sum for thread in warp

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
  tmp += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ __shared__ float s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = tmp;
+ }
+ __syncthreads();
+ tmp = s_sum[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float mean = tmp / ncols;
  const float scale = rsqrtf(mean + eps);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  dst[row*ncols + col] = scale * x[row*ncols + col];
  }
  }
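
The reworked kernels accumulate the sum and the sum of squares in a single float2 and derive mean and variance afterwards via Var[x] = E[x^2] - E[x]^2, then reduce across the warp and, when the block is wider than a warp, once more through shared memory. Setting the CUDA reduction machinery aside, the single-pass mean/variance arithmetic looks like this in plain C (a sketch of the math, not the kernel itself):

```c
#include <math.h>
#include <stdio.h>

/* One-pass row normalization: accumulate sum and sum of squares, then use
 * Var[x] = E[x^2] - E[x]^2, the identity the float2 accumulator relies on. */
static void norm_row(const float * x, float * dst, int ncols, float eps) {
    float sum = 0.0f, sumsq = 0.0f;
    for (int col = 0; col < ncols; col++) {
        sum   += x[col];
        sumsq += x[col] * x[col];
    }
    const float mean    = sum / ncols;
    const float var     = sumsq / ncols - mean * mean;
    const float inv_std = 1.0f / sqrtf(var + eps);
    for (int col = 0; col < ncols; col++) {
        dst[col] = (x[col] - mean) * inv_std;
    }
}

int main(void) {
    float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f }, y[4];
    norm_row(x, y, 4, 1e-5f);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```
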
@@ -4186,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ }
  }

  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ }
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -76,6 +76,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(rms_norm);
  GGML_METAL_DECL_KERNEL(norm);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,10 +117,24 @@ static NSString * const msl_library_source = @"see metal.metal";
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
  metal_printf("%s: allocating\n", __func__);

- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ // Show all the Metal device instances in the system
+ NSArray * devices = MTLCopyAllDevices();
+ id <MTLDevice> device;
+ NSString * s;
+ for (device in devices) {
+ s = [device name];
+ metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+ }

+ // Pick and show default Metal device
+ device = MTLCreateSystemDefaultDevice();
+ s = [device name];
+ metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+ // Configure context
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ ctx->device = device;
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
- ctx->device = MTLCreateSystemDefaultDevice();
  ctx->queue = [ctx->device newCommandQueue];
  ctx->n_buffers = 0;
  ctx->concur_list_len = 0;
@@ -205,6 +220,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(rms_norm);
  GGML_METAL_ADD_KERNEL(norm);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -270,6 +286,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(rms_norm);
  GGML_METAL_DEL_KERNEL(norm);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

  void * ggml_metal_host_malloc(size_t n) {
  void * data = NULL;
- const int result = posix_memalign((void **) &data, getpagesize(), n);
+ const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
  if (result != 0) {
  metal_printf("%s: error: posix_memalign failed\n", __func__);
  return NULL;
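
getpagesize() is a legacy interface, so the Metal host paths now query the page size through sysconf(_SC_PAGESIZE). A minimal sketch of the same page-aligned host allocation (a hypothetical wrapper, not the Metal backend's own function):

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Allocate n bytes aligned to the system page size, the alignment the
 * Metal backend wants for host buffers it later maps without copies. */
static void * page_aligned_alloc(size_t n) {
    void * data = NULL;
    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
    if (posix_memalign(&data, page, n) != 0) {
        return NULL;   /* allocation failed */
    }
    return data;
}

int main(void) {
    void * p = page_aligned_alloc(1 << 20);
    printf("allocated 1 MiB at %p\n", p);
    free(p);
    return 0;
}
```
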
@@ -384,7 +401,7 @@ bool ggml_metal_add_buffer(
  }
  }

- const size_t size_page = getpagesize();
+ const size_t size_page = sysconf(_SC_PAGESIZE);

  size_t size_aligned = size;
  if ((size_aligned % size_page) != 0) {
@@ -854,7 +871,11 @@ void ggml_metal_graph_compute(
  {
  nth0 = 32;
  nth1 = 1;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+ if (ne11 * ne12 < 4) {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+ } else {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+ }
  } break;
  case GGML_TYPE_Q4_0:
  {
@@ -906,8 +927,8 @@ void ggml_metal_graph_compute(
  GGML_ASSERT(ne02 == 1);
  GGML_ASSERT(ne12 == 1);

- nth0 = 2;
- nth1 = 32;
+ nth0 = 4; //1;
+ nth1 = 8; //32;
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
  } break;
  case GGML_TYPE_Q5_K:
@@ -955,9 +976,12 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];

  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+ src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
+ else if (src0t == GGML_TYPE_Q4_K) {
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ }
  else if (src0t == GGML_TYPE_Q3_K) {
  #ifdef GGML_QKK_64
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +995,8 @@ void ggml_metal_graph_compute(
  else if (src0t == GGML_TYPE_Q6_K) {
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  } else {
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ int64_t ny = (ne11 + 3)/4;
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
  }
  }
  } break;
@@ -1117,7 +1141,7 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
  [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
  } break;
  case GGML_OP_DUP:
  case GGML_OP_CPY: