RubyGems - llama_cpp - Versions diffs - 0.1.0 → 0.1.2 - Mend

llama_cpp 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/CHANGELOG.md +17 -0
data/ext/llama_cpp/extconf.rb +7 -0
data/ext/llama_cpp/llama_cpp.cpp +153 -21
data/ext/llama_cpp/src/ggml-cuda.h +4 -0
data/ext/llama_cpp/src/ggml-opencl.c +291 -215
data/ext/llama_cpp/src/ggml.c +4428 -2143
data/ext/llama_cpp/src/ggml.h +216 -13
data/ext/llama_cpp/src/llama-util.h +23 -23
data/ext/llama_cpp/src/llama.cpp +300 -149
data/ext/llama_cpp/src/llama.h +38 -25
data/lib/llama_cpp/client.rb +1 -3
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +2 -0
data/sig/llama_cpp.rbs +4 -4
metadata +2 -2

data/ext/llama_cpp/src/ggml-opencl.c CHANGED

@@ -10,164 +10,148 @@
 #include "ggml.h"
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
+static const char * program_source = MULTILINE_QUOTE(
-struct block_q4_0
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+struct __attribute__ ((packed)) block_q4_0
 {
-    float d;
-    uchar qs[16];
+    half d;
+    uint8_t qs[16]; /* QK4_0 / 2 */
 };
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-    const float d = blocks[i].d;
+struct __attribute__ ((packed)) block_q4_1
+{
+    half d;
+    half m;
+    uint8_t qs[16]; /* QK4_1 / 2 */
+};
-    const uchar vi = blocks[i].qs[l];
+struct __attribute__ ((packed)) block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[16]; /* QK5_0 / 2 */
+};
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
+struct __attribute__ ((packed)) block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[16]; /* QK5_1 / 2 */
+};
-struct block_q4_1
+struct __attribute__ ((packed)) block_q8_0
 {
-    float d;
-    float m;
-    uchar qs[16];
+    half d;
+    int8_t qs[32]; /* QK8_0 */
 };
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    const uint i = get_global_id(0) / 32; /* QK4_0 */
+    const uint j = get_local_id(0);
-    const uchar vi = blocks[i].qs[l];
+    const float d = vload_half(0, (__global half*) &x[i].d);
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >>  4) - 8;
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
+    y[i*32 + j + 0 ] = x0*d;
+    y[i*32 + j + 16] = x1*d;
+}
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    const uint i = get_global_id(0) / 32; /* QK4_1 */
+    const uint j = get_local_id(0);
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
-    const uchar vi = blocks[i].qs[l];
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >>  4);
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
+    y[i*32 + j + 0 ] = x0*d + m;
+    y[i*32 + j + 16] = x1*d + m;
 }
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    const uint i = get_global_id(0) / 32; /* QK5_0 */
+    const uint j = get_local_id(0);
-struct block_q5_0
-{
-    float d;
-    uint qh;
-    uchar qs[16];
-};
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-    const float d = blocks[i].d;
+    const float d = vload_half(0, (__global half*) &x[i].d);
-    const uchar vi = blocks[i].qs[l];
+    uint32_t qh = x[i].qh;
-    const uint l2 = l * 2;
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >>  4) | vh1) - 16)*d;
+    y[i*32 + j + 0 ] = x0*d;
+    y[i*32 + j + 16] = x1*d;
 }
-struct block_q5_1
-{
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
-};
-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    const uint i = get_global_id(0) / 32; /* QK5_1 */
+    const uint j = get_local_id(0);
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
-    const uchar vi = blocks[i].qs[l];
+    uint32_t qh = x[i].qh;
-    const uint l2 = l * 2;
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >>  4) | xh_1;
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >>  4) | vh1)*d + m;
+    y[i*32 + j + 0 ] = x0*d + m;
+    y[i*32 + j + 16] = x1*d + m;
 }
-struct block_q8_0
-{
-    float d;
-    char qs[32];
-};
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    const uint i = get_global_id(0) / 32; /* QK8_0 */
+    const uint j = get_local_id(0);
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    y[i*32 + j] = x[i].qs[j]*d;
 }
 );
-#define CL_CHECK(err, name)                                                                     \
-    do {                                                                                        \
-        cl_int err_ = (err);                                                                    \
-        if (err_ != CL_SUCCESS) {                                                               \
-            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
-            exit(1);                                                                            \
-        }                                                                                       \
+#define CL_CHECK(err)                                               \
+    do {                                                            \
+        cl_int err_ = (err);                                        \
+        if (err_ != CL_SUCCESS) {                                   \
+            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            exit(1);                                                \
+        }                                                           \
     } while (0)
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-typedef struct {
-    float d;                // delta
-    uint32_t qh;          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
+#define CLBLAST_CHECK(err)                                          \
+    do {                                                            \
+        CLBlastStatusCode err_ = (err);                             \
+        if (err_ != CLBlastSuccess) {                               \
+            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            exit(1);                                                \
+        }                                                           \
+    } while (0)
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
@@ -202,50 +186,174 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
 void ggml_cl_init(void) {
     cl_int err = 0;
-    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
-    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
-    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
-    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
-    printf("\nInitializing CLBlast (First Run)...");
-    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
-    cl_uint num_platforms;
-    clGetPlatformIDs(0, NULL, &num_platforms);
-    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
-    clGetPlatformIDs(num_platforms, platforms, NULL);
-    platform = platforms[plat_num];
-    char platform_buffer[1024];
-    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
-    cl_uint num_devices;
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
-    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
-    device = devices[dev_num];
-    char device_buffer[1024];
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
-    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
-    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
-    CL_CHECK(err, "clCreateContext");
-    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    CL_CHECK(err, "clCreateCommandQueue");
-    free(platforms);
-    free(devices);
-    program = build_program_from_source(context, device, clblast_dequant);
+    struct cl_device;
+    struct cl_platform {
+        cl_platform_id id;
+        unsigned number;
+        char name[128];
+        char vendor[128];
+        struct cl_device * devices;
+        unsigned n_devices;
+        struct cl_device * default_device;
+    };
+    struct cl_device {
+        struct cl_platform * platform;
+        cl_device_id id;
+        unsigned number;
+        cl_device_type type;
+        char name[128];
+    };
+    enum { NPLAT = 16, NDEV = 16 };
+    struct cl_platform platforms[NPLAT];
+    unsigned n_platforms = 0;
+    struct cl_device devices[NDEV];
+    unsigned n_devices = 0;
+    struct cl_device * default_device = NULL;
+    platform = NULL;
+    device = NULL;
+    cl_platform_id platform_ids[NPLAT];
+    CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
+    for (unsigned i = 0; i < n_platforms; i++) {
+        struct cl_platform * p = &platforms[i];
+        p->number = i;
+        p->id = platform_ids[i];
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+        cl_device_id device_ids[NDEV];
+        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+            p->n_devices = 0;
+        } else {
+            CL_CHECK(clGetDeviceIDsError);
+        }
+        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+        p->default_device = NULL;
+        for (unsigned j = 0; j < p->n_devices; j++) {
+            struct cl_device * d = &devices[n_devices];
+            d->number = n_devices++;
+            d->id = device_ids[j];
+            d->platform = p;
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+                p->default_device = d;
+            }
+        }
+        if (default_device == NULL && p->default_device != NULL) {
+            default_device = p->default_device;
+        }
+    }
+    if (n_devices == 0) {
+        fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
+        exit(1);
+    }
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+    unsigned n;
+    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+        user_platform_number = (int)n;
+    }
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+        user_device_number = (int)n;
+    }
+    struct cl_device * selected_devices = devices;
+    unsigned n_selected_devices = n_devices;
+    if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+        for (unsigned i = 0; i < n_platforms; i++) {
+            struct cl_platform * p = &platforms[i];
+            if (strstr(p->name, user_platform_string) != NULL ||
+                strstr(p->vendor, user_platform_string) != NULL) {
+                user_platform_number = (int)i;
+                break;
+            }
+        }
+        if (user_platform_number == -1) {
+            fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+            exit(1);
+        }
+    }
+    if (user_platform_number != -1) {
+        struct cl_platform * p = &platforms[user_platform_number];
+        selected_devices = p->devices;
+        n_selected_devices = p->n_devices;
+        default_device = p->default_device;
+        if (n_selected_devices == 0) {
+            fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+            exit(1);
+        }
+    }
+    if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+        for (unsigned i = 0; i < n_selected_devices; i++) {
+            struct cl_device * d = &selected_devices[i];
+            if (strstr(d->name, user_device_string) != NULL) {
+                user_device_number = d->number;
+                break;
+            }
+        }
+        if (user_device_number == -1) {
+            fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+            exit(1);
+        }
+    }
+    if (user_device_number != -1) {
+        selected_devices = &devices[user_device_number];
+        n_selected_devices = 1;
+        default_device = &selected_devices[0];
+    }
+    GGML_ASSERT(n_selected_devices > 0);
+    if (default_device == NULL) {
+        default_device = &selected_devices[0];
+    }
+    fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+    fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
+    if (default_device->type != CL_DEVICE_TYPE_GPU) {
+        fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    }
+    platform = default_device->platform->id;
+    device = default_device->id;
+    cl_context_properties properties[] = {
+        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
+    };
+    CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+    CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+        (err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
+        (queue = clCreateCommandQueue(context, device, 0, &err), err)
+    )));
+    program = build_program_from_source(context, device, program_source);
     // Prepare dequantize kernels
-    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
-    CL_CHECK(err, "clCreateKernel");
+    CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
+    CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
+    CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
+    CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
+    CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
 }
 static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -258,9 +366,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
         clReleaseMemObject(*buf);
     }
     cl_int err;
-    *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
+    CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
     *cur_size = req_size;
-    CL_CHECK(err, "clCreateBuffer");
 }
 void ggml_cl_sgemm_wrapper(
@@ -269,12 +376,10 @@ void ggml_cl_sgemm_wrapper(
         const float alpha, const void *host_a, const int lda,
         const float *host_b, const int ldb, const float beta,
         float *host_c, const int ldc, const int btype) {
-    cl_int err = 0;
     cl_kernel kernel;
     size_t global = n * k, local, size_qb;
     bool dequant;
-    cl_block_q5_0* cl_host_b;
     switch (btype) {
     case GGML_TYPE_F32:
@@ -284,36 +389,19 @@ void ggml_cl_sgemm_wrapper(
         dequant = true;
         kernel = kernel_q4_0;
         local = 16;
-        size_qb = global * (sizeof(float) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
         break;
     case GGML_TYPE_Q4_1:
         dequant = true;
         kernel = kernel_q4_1;
         local = 16;
-        size_qb = global * (sizeof(float) * 2 + local) / 32;
-        break;
-    case GGML_TYPE_Q4_2:
-        dequant = true;
-        kernel = kernel_q4_2;
-        local = 8;
-        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
+        size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
         break;
     case GGML_TYPE_Q5_0:
         dequant = true;
         kernel = kernel_q5_0;
         local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
         break;
     case GGML_TYPE_Q5_1:
         dequant = true;
@@ -325,7 +413,7 @@ void ggml_cl_sgemm_wrapper(
         dequant = true;
         kernel = kernel_q8_0;
         local = 32;
-        size_qb = global * (sizeof(float) + local) / 32;
+        size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
         break;
     default:
         fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -347,52 +435,40 @@ void ggml_cl_sgemm_wrapper(
     cl_event ev_a, ev_qb, ev_b;
     if (dequant) {
-        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
-        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
-        CL_CHECK(err, "clSetKernelArg");
-        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
-        CL_CHECK(err, "clEnqueueWriteBuffer qb");
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
     } else {
-        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
-        CL_CHECK(err, "clEnqueueWriteBuffer b");
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
     }
-    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
-    CL_CHECK(err, "clEnqueueWriteBuffer a");
+    CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
     if (dequant) {
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
-        CL_CHECK(err, "clEnqueueNDRangeKernel");
-        clReleaseEvent(ev_qb);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
+        CL_CHECK(clReleaseEvent(ev_qb));
     }
-    clWaitForEvents(1, &ev_a);
-    clWaitForEvents(1, &ev_b);
-    clReleaseEvent(ev_a);
-    clReleaseEvent(ev_b);
+    CL_CHECK(clWaitForEvents(1, &ev_a));
+    CL_CHECK(clWaitForEvents(1, &ev_b));
+    CL_CHECK(clReleaseEvent(ev_a));
+    CL_CHECK(clReleaseEvent(ev_b));
     cl_event ev_sgemm;
-    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
-                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
-                                            m, n, k,
-                                            alpha,
-                                            cl_buffer_a, 0, lda,
-                                            cl_buffer_b, 0, ldb,
-                                            beta,
-                                            cl_buffer_c, 0, ldc,
-                                            &queue, &ev_sgemm);
-    if (status != CLBlastSuccess) {
-        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
-        abort();
-    }
+    CLBLAST_CHECK(CLBlastSgemm(
+        (CLBlastLayout)order,
+        (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+        m, n, k,
+        alpha,
+        cl_buffer_a, 0, lda,
+        cl_buffer_b, 0, ldb,
+        beta,
+        cl_buffer_c, 0, ldc,
+        &queue, &ev_sgemm));
     cl_event ev_c;
-    clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
+    CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));
     // Wait for completion
-    clWaitForEvents(1, &ev_c);
-    clReleaseEvent(ev_sgemm);
-    clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
+    CL_CHECK(clWaitForEvents(1, &ev_c));
+    CL_CHECK(clReleaseEvent(ev_sgemm));
+    CL_CHECK(clReleaseEvent(ev_c));
 }