RubyGems - llama_cpp - Versions diffs - 0.0.5 → 0.0.7 - Mend

llama_cpp 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +22 -0
data/ext/llama_cpp/extconf.rb +24 -1
data/ext/llama_cpp/llama_cpp.cpp +72 -0
data/ext/llama_cpp/src/ggml-cuda.h +44 -0
data/ext/llama_cpp/src/ggml-opencl.c +216 -0
data/ext/llama_cpp/src/ggml-opencl.h +24 -0
data/ext/llama_cpp/src/ggml.c +2324 -969
data/ext/llama_cpp/src/ggml.h +656 -619
data/ext/llama_cpp/src/llama.cpp +269 -42
data/ext/llama_cpp/src/llama.h +22 -14
data/ext/llama_cpp/src/llama_util.h +15 -3
data/lib/llama_cpp/client.rb +151 -0
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +16 -8
data/sig/llama_cpp.rbs +26 -2
metadata +6 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2df0c858faac117b7317683fb7b9a52fc0eb4f7329f728ac6a209085af487142
-  data.tar.gz: 6b5c5d5d5d4e9020b92c7d76c12086fd77089ecc9c2181fb9d8157df5267da96
+  metadata.gz: 8dbb27d65bfca1fc1e0c080fccf6de6a499e3c829e0542e08cd82d0bedb64a28
+  data.tar.gz: d115f89c533c41296a330da96f7b719a2c649d7fbd9256aeb033f72d9c7cb190
 SHA512:
-  metadata.gz: 8e9d3ccdb8cdc9d4cb7b60f32a709c874953c357fdaccc057502e5761efdec62a0fc0b39929448203ffc4210dbf0ca2f6019dc13f88cf0db84b754f44fd77bea
-  data.tar.gz: 75fc1d6674c8d509ae0557308277d6d3d7e05f5a6fbea512c2472c46bea1de6e2541a67ec3dda43f874d7f64e6981b720aa1c722d3ec7ea3b96ae9084a4d201b
+  metadata.gz: f4a818135e9873ace4ca931542897d9037a5430baee209a04b248ffd03b44a1360120e3d1d01089a46af1f9dbf5fec3a18a4d6ccd540d6463089a7b806f20ec7
+  data.tar.gz: ad80dcc69d2e14c462cc22aea0456296af9cd5015d499bb8683766b21cec2ccf619f827e8ad3a4417cbbd5ec45944dd85837a157be82279cf66e6de6b5ae0bc8

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,27 @@
 ## [Unreleased]
+## [[0.0.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.6...v0.0.7)] - 2023-04-29
+- Bump bundled llama.cpp from master-12b5900 to master-11d9023.
+- Add Client class.
+- Add model file type constants.
+- Add getter and setter methods of use_mmap to ContextParams.
+- Add empty? method to Context.
+- Add clblast config option:
+  ```
+  $ gem install llama_cpp -- --with-clblast
+  ```
+## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
+- Bump bundled llama.cpp from master-315a95a to master-12b5900.
+- Add model file type constants.
+- Add `model_quantize` module function to LLaMACpp.
+- Add cublas config option:
+  ```
+  $ gem install llama_cpp -- --with-cublas
+  ```
 ## [[0.0.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.4...v0.0.5)] - 2023-04-20
 - Bump bundled llama.cpp from master-c85e03d to master-315a95a.

data/ext/llama_cpp/extconf.rb CHANGED Viewed

@@ -5,6 +5,8 @@ require 'mkmf'
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 $srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
+$srcs << 'ggml-opencl.c' if with_config('clblast')
 $CFLAGS << ' -w'
 $CXXFLAGS << ' -std=c++11'
 $INCFLAGS << ' -I$(srcdir)/src'
@@ -23,8 +25,22 @@ if with_config('openblas')
 end
 if with_config('accelerate')
+  abort 'Accelerate framework is not found.' unless have_framework('Accelerate')
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
-  $LDFLAGS << ' -framework Accelerate'
+end
+# cuBLAS backend (enabled with `--with-cublas`): defines GGML_USE_CUBLAS and
+# links against the CUDA libraries.
+# NOTE(review): the CUDA include and library directories are hard-coded to
+# /usr/local/cuda — installations in other locations will need adjusting.
+if with_config('cublas')
+  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
+  # The object list is set explicitly so that ggml-cuda.o, which has no entry
+  # in $srcs, is linked in; it is built by the nvcc rule appended to the
+  # Makefile at the end of this script.
+  $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+end
+# CLBlast backend (enabled with `--with-clblast`): requires both libclblast
+# and libOpenCL to be linkable, aborts the build otherwise, and defines
+# GGML_USE_CLBLAST for the C sources.
+if with_config('clblast')
+  abort 'libclblast is not found.' unless have_library('clblast')
+  abort 'libOpenCL is not found.' unless have_library('OpenCL')
+  $CFLAGS << ' -DGGML_USE_CLBLAST'
 end
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -49,3 +65,10 @@ end
 # rubocop:enable Layout/LineLength
 create_makefile('llama_cpp/llama_cpp')
+# With `--with-cublas`, append an explicit rule to the generated Makefile so
+# that ggml-cuda.o is compiled from ggml-cuda.cu by nvcc (it is listed in
+# $objs but not in $srcs, so mkmf generates no rule for it).
+if with_config('cublas')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
+    # NOTE(review): `-arch=native` presumably targets the GPU of the build
+    # host and needs a sufficiently recent nvcc — confirm against CUDA docs.
+    f.puts "\tnvcc -arch=native -c -o $@ $<"
+  end
+end

data/ext/llama_cpp/llama_cpp.cpp CHANGED Viewed

@@ -53,6 +53,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
     rb_define_method(rb_cLLaMAContextParams, "vocab_only=", RUBY_METHOD_FUNC(_llama_context_params_set_vocab_only), 1);
     rb_define_method(rb_cLLaMAContextParams, "vocab_only", RUBY_METHOD_FUNC(_llama_context_params_get_vocab_only), 0);
+    rb_define_method(rb_cLLaMAContextParams, "use_mmap=", RUBY_METHOD_FUNC(_llama_context_params_set_use_mmap), 1);
+    rb_define_method(rb_cLLaMAContextParams, "use_mmap", RUBY_METHOD_FUNC(_llama_context_params_get_use_mmap), 0);
     rb_define_method(rb_cLLaMAContextParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_context_params_set_use_mlock), 1);
     rb_define_method(rb_cLLaMAContextParams, "use_mlock", RUBY_METHOD_FUNC(_llama_context_params_get_use_mlock), 0);
     rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
@@ -140,6 +142,18 @@ private:
     return ptr->params.vocab_only ? Qtrue : Qfalse;
   };
+  // use_mmap
+  // Setter behind ContextParams#use_mmap=. Only the exact value `true` turns
+  // the flag on; any other object stores false. Returns the stored flag as
+  // Qtrue/Qfalse.
+  static VALUE _llama_context_params_set_use_mmap(VALUE self, VALUE use_mmap) {
+    LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
+    const bool enabled = (use_mmap == Qtrue);
+    wrapper->params.use_mmap = enabled;
+    if (wrapper->params.use_mmap) {
+      return Qtrue;
+    }
+    return Qfalse;
+  };
+  // Getter behind ContextParams#use_mmap: reports the stored flag as
+  // Qtrue/Qfalse.
+  static VALUE _llama_context_params_get_use_mmap(VALUE self) {
+    const LLaMAContextParamsWrapper* wrapper = get_llama_context_params(self);
+    if (wrapper->params.use_mmap) {
+      return Qtrue;
+    }
+    return Qfalse;
+  };
   // use_mlock
   static VALUE _llama_context_params_set_use_mlock(VALUE self, VALUE use_mlock) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -226,6 +240,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
+    rb_define_method(rb_cLLaMAContext, "empty?", RUBY_METHOD_FUNC(_llama_context_empty), 0);
     rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
     rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
     rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
@@ -514,6 +529,14 @@ private:
     return Qnil;
   };
+  // Context#empty?: Qtrue when the wrapper holds no llama context (ctx is
+  // NULL), Qfalse otherwise.
+  static VALUE _llama_context_empty(VALUE self) {
+    const LLaMAContextWrapper* wrapper = get_llama_context(self);
+    return wrapper->ctx == NULL ? Qtrue : Qfalse;
+  }
   static VALUE _llama_context_free(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx != NULL) {
@@ -612,6 +635,43 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 // module functions
+// LLaMACpp.model_quantize(input_path:, output_path:, ftype:, n_threads: 1)
+//
+// Keyword arguments:
+//   input_path  - String, required
+//   output_path - String, required
+//   ftype       - Integer, required; cast to llama_ftype
+//   n_threads   - Integer, optional; 1 is used when omitted
+//
+// Raises ArgumentError when an argument has the wrong type and RuntimeError
+// when llama_model_quantize returns a non-zero status. Returns nil.
+static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
+  enum { KW_INPUT_PATH = 0, KW_OUTPUT_PATH, KW_FTYPE, KW_N_THREADS, KW_COUNT };
+  VALUE options = Qnil;
+  ID keys[KW_COUNT] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("ftype"), rb_intern("n_threads") };
+  VALUE values[KW_COUNT] = { Qundef, Qundef, Qundef, Qundef };
+  rb_scan_args(argc, argv, ":", &options);
+  // Three required keywords followed by one optional keyword.
+  rb_get_kwargs(options, keys, 3, 1, values);
+  const bool has_n_threads = values[KW_N_THREADS] != Qundef;
+  // rb_raise does not return, so no explicit return follows each check.
+  if (!RB_TYPE_P(values[KW_INPUT_PATH], T_STRING)) {
+    rb_raise(rb_eArgError, "input_path must be a string");
+  }
+  if (!RB_TYPE_P(values[KW_OUTPUT_PATH], T_STRING)) {
+    rb_raise(rb_eArgError, "output_path must be a string");
+  }
+  if (!RB_INTEGER_TYPE_P(values[KW_FTYPE])) {
+    rb_raise(rb_eArgError, "ftype must be an integer");
+  }
+  if (has_n_threads && !RB_INTEGER_TYPE_P(values[KW_N_THREADS])) {
+    rb_raise(rb_eArgError, "n_threads must be an integer");
+  }
+  const char* src_path = StringValueCStr(values[KW_INPUT_PATH]);
+  const char* dst_path = StringValueCStr(values[KW_OUTPUT_PATH]);
+  const int file_type = NUM2INT(values[KW_FTYPE]);
+  int thread_count = 1;
+  if (has_n_threads) {
+    thread_count = NUM2INT(values[KW_N_THREADS]);
+  }
+  const int status = llama_model_quantize(src_path, dst_path, (llama_ftype)file_type, thread_count);
+  if (status != 0) {
+    rb_raise(rb_eRuntimeError, "Failed to quantize model");
+  }
+  return Qnil;
+}
 static VALUE rb_llama_token_bos(VALUE self) {
   return INT2NUM(llama_token_bos());
 }
@@ -638,12 +698,24 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAContext::define_class(rb_mLLaMACpp);
   RbLLaMAContextParams::define_class(rb_mLLaMACpp);
+  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
   rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   std::stringstream ss_magic;
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;

data/ext/llama_cpp/src/ggml-cuda.h ADDED Viewed

@@ -0,0 +1,44 @@
+// CUDA / cuBLAS helper declarations.
+// Guarded against multiple inclusion, and includes the standard headers that
+// the error-check macros below rely on (fprintf, exit).
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#ifdef  __cplusplus
+extern "C" {
+#endif
+// Evaluate a CUDA runtime call once; if it did not return cudaSuccess, print
+// the error code, source location and cudaGetErrorString() text to stderr and
+// terminate the process with exit(1).
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+// Evaluate a cuBLAS call once; if it did not return CUBLAS_STATUS_SUCCESS,
+// print the status code and source location to stderr and exit(1).
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+// Process-wide cuBLAS handle and CUDA stream; declared here, defined in
+// another translation unit.
+extern cublasHandle_t g_cublasH;
+extern cudaStream_t   g_cudaStream;
+// NOTE(review): presumably sets up g_cublasH / g_cudaStream — confirm in the
+// implementation file.
+void   ggml_init_cublas(void);
+// Pooled device allocation helpers.
+// NOTE(review): *actual_size presumably receives the size of the buffer that
+// was really handed out, and that value is what ggml_cuda_pool_free expects
+// as `size` — confirm in the implementation file.
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+void   ggml_cuda_pool_free(void * ptr, size_t size);
+// Dequantization launchers, one per quantization format, each issued on
+// `stream`.
+// NOTE(review): presumably vx is quantized device data, y the float output and
+// k the element count — confirm in the implementation file.
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+#ifdef  __cplusplus
+}
+#endif

data/ext/llama_cpp/src/ggml-opencl.c ADDED Viewed

@@ -0,0 +1,216 @@
+#include "ggml-opencl.h"
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "ggml.h"
+#include "ggml-opencl-dequant.cl"
+// Evaluate an OpenCL status expression once; if it is not CL_SUCCESS, print
+// `name` (a label for the failing call), the numeric error code and the
+// source location to stderr, then terminate the process with exit(1).
+#define CL_CHECK(err, name)                                                                     \
+    do {                                                                                        \
+        cl_int err_ = (err);                                                                    \
+        if (err_ != CL_SUCCESS) {                                                               \
+            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
+            exit(1);                                                                            \
+        }                                                                                       \
+    } while (0)
+// File-scope OpenCL state, populated by ggml_cl_init().
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_program program;
+// One dequantization kernel per supported quantized format.
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
+// Device buffers reused across ggml_cl_sgemm_wrapper() calls: A, quantized B,
+// float B and C. They are (re)allocated on demand by ggml_cl_malloc().
+static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
+// Current capacity in bytes of each buffer above; 0 means not yet allocated.
+static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
+// Create and build an OpenCL program from a NUL-terminated source string.
+//
+//   ctx            - context the program is created in
+//   dev            - device whose build log is fetched if the build fails
+//   program_buffer - OpenCL C source text
+//
+// Returns the built program. Every failure is fatal: a diagnostic (including
+// the compiler build log when it can be retrieved) is written to stderr and
+// the process exits with status 1.
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
+    cl_int err = CL_SUCCESS;
+    size_t program_size = strlen(program_buffer);
+    cl_program p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error creating program\n");
+        exit(1);
+    }
+    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        size_t log_size = 0;
+        char *program_log = NULL;
+        // Query the log length first; only allocate when that query succeeded.
+        if (clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) == CL_SUCCESS) {
+            program_log = (char*) malloc(log_size + 1);
+        }
+        if (program_log != NULL &&
+            clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL) == CL_SUCCESS) {
+            program_log[log_size] = '\0';
+            fprintf(stderr, "%s\n", program_log);
+        } else {
+            // Still report the failure when the log itself cannot be obtained.
+            fprintf(stderr, "OpenCL error %d building program (build log unavailable)\n", err);
+        }
+        free(program_log);
+        exit(1);
+    }
+    return p;
+}
+// Initialise the file-scope OpenCL state used by the CLBlast backend.
+//
+// The platform and device are chosen by index from the environment variables
+// GGML_CLBLAST_PLATFORM and GGML_CLBLAST_DEVICE (index 0 when unset). The
+// function then creates the context and command queue, builds the
+// dequantization program and creates one kernel per quantized format.
+//
+// Every failure is fatal: an out-of-range index, an allocation failure or an
+// OpenCL error prints a message to stderr and exits with status 1.
+void ggml_cl_init(void) {
+    cl_int err = 0;
+    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
+    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
+    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
+    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
+    printf("\nInitializing CLBlast (First Run)...");
+    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
+    // Select the platform, rejecting indices outside the enumerated range
+    // instead of reading past the end of the array.
+    cl_uint num_platforms = 0;
+    err = clGetPlatformIDs(0, NULL, &num_platforms);
+    CL_CHECK(err, "clGetPlatformIDs");
+    if (plat_num < 0 || (cl_uint)plat_num >= num_platforms) {
+        fprintf(stderr, "OpenCL platform index %d is out of range (%u available)\n", plat_num, (unsigned)num_platforms);
+        exit(1);
+    }
+    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+    if (platforms == NULL) {
+        fprintf(stderr, "OpenCL error: out of memory listing platforms\n");
+        exit(1);
+    }
+    err = clGetPlatformIDs(num_platforms, platforms, NULL);
+    CL_CHECK(err, "clGetPlatformIDs");
+    platform = platforms[plat_num];
+    free(platforms);
+    // The name is informational only; start from an empty string so a failed
+    // query still prints something well-defined.
+    char platform_buffer[1024] = "";
+    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), platform_buffer, NULL);
+    // Select the device with the same bounds checking.
+    cl_uint num_devices = 0;
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+    CL_CHECK(err, "clGetDeviceIDs");
+    if (dev_num < 0 || (cl_uint)dev_num >= num_devices) {
+        fprintf(stderr, "OpenCL device index %d is out of range (%u available)\n", dev_num, (unsigned)num_devices);
+        exit(1);
+    }
+    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+    if (devices == NULL) {
+        fprintf(stderr, "OpenCL error: out of memory listing devices\n");
+        exit(1);
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+    CL_CHECK(err, "clGetDeviceIDs");
+    device = devices[dev_num];
+    free(devices);
+    char device_buffer[1024] = "";
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), device_buffer, NULL);
+    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
+    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "clCreateContext");
+    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+    CL_CHECK(err, "clCreateCommandQueue");
+    program = build_program_from_source(context, device, clblast_dequant);
+    // Prepare dequantize kernels
+    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
+    CL_CHECK(err, "clCreateKernel");
+}
+// Make sure the cached device buffer *buf holds at least req_size bytes.
+//
+//   req_size - number of bytes required
+//   cur_size - in/out: current capacity of *buf (0 = never allocated)
+//   flags    - cl_mem_flags used if a new buffer has to be created
+//   buf      - in/out: the buffer handle
+//
+// A buffer that is already large enough is left untouched. Otherwise any
+// existing buffer is released and a new one of exactly req_size bytes is
+// created; a creation failure is fatal (CL_CHECK).
+static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
+    const int needs_growth = (req_size > *cur_size);
+    if (needs_growth) {
+        if (*cur_size != 0) {
+            // Drop the undersized buffer before replacing it.
+            clReleaseMemObject(*buf);
+        }
+        cl_int status;
+        cl_mem fresh = clCreateBuffer(context, flags, req_size, NULL, &status);
+        *buf = fresh;
+        *cur_size = req_size;
+        CL_CHECK(status, "clCreateBuffer");
+    }
+}
+// Run a single-precision GEMM on the OpenCL device through CLBlast.
+//
+//   order, trans_a, trans_b - layout / transpose selectors, cast directly to
+//                             the corresponding CLBlast enums
+//   m, n, k                 - GEMM dimensions
+//   alpha, beta             - GEMM scalars
+//   host_a, lda             - host matrix A (m * k floats are uploaded)
+//   host_b, ldb             - host matrix B; plain floats when btype is
+//                             GGML_TYPE_F32, otherwise quantized data that is
+//                             dequantized on the device into n * k floats
+//   host_c, ldc             - host output C (m * n floats are read back)
+//   btype                   - ggml type of host_b; F32, Q4_0, Q4_1, Q4_2 and
+//                             Q4_3 are supported, anything else aborts
+//
+// The call blocks until C has been copied back. Any OpenCL or CLBlast failure
+// is fatal and reported on stderr.
+void ggml_cl_sgemm_wrapper(
+        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
+        const int m, const int n, const int k,
+        const float alpha, const void *host_a, const int lda,
+        const float *host_b, const int ldb, const float beta,
+        float *host_c, const int ldc, const int btype) {
+    cl_int err = 0;
+    cl_kernel kernel = NULL;
+    // Widen to size_t before multiplying so large matrices cannot overflow int.
+    size_t global = (size_t)n * (size_t)k, local = 0, size_qb = 0;
+    bool dequant;
+    // Pick the dequantization kernel, its work-group size and the byte size of
+    // the quantized input for the given format.
+    switch (btype) {
+    case GGML_TYPE_F32:
+        dequant = false;
+        break;
+    case GGML_TYPE_Q4_0:
+        dequant = true;
+        kernel = kernel_q4_0;
+        local = 16;
+        size_qb = global * (sizeof(float) + local) / 32;
+        break;
+    case GGML_TYPE_Q4_1:
+        dequant = true;
+        kernel = kernel_q4_1;
+        local = 16;
+        size_qb = global * (sizeof(float) * 2 + local) / 32;
+        break;
+    case GGML_TYPE_Q4_2:
+        dequant = true;
+        kernel = kernel_q4_2;
+        local = 8;
+        size_qb = global * (sizeof(short) + local) / 16;
+        break;
+    case GGML_TYPE_Q4_3:
+        dequant = true;
+        kernel = kernel_q4_3;
+        local = 8;
+        size_qb = global * (sizeof(short) * 2 + local) / 16;
+        break;
+    default:
+        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
+        abort();
+    }
+    const size_t size_a =  (size_t)m * (size_t)k * sizeof(float);
+    const size_t size_b =  (size_t)n * (size_t)k * sizeof(float);
+    const size_t size_c =  (size_t)m * (size_t)n * sizeof(float);
+    // Prepare buffers
+    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
+    if (dequant) {
+        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
+    }
+    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
+    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
+    // Every enqueue is checked: on failure its output event is not valid, and
+    // waiting on or releasing it afterwards would be undefined behaviour.
+    cl_event ev_a, ev_qb, ev_b;
+    if (dequant) {
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
+        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
+        CL_CHECK(err, "clSetKernelArg");
+        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
+        CL_CHECK(err, "clEnqueueWriteBuffer qb");
+    } else {
+        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
+        CL_CHECK(err, "clEnqueueWriteBuffer b");
+    }
+    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
+    CL_CHECK(err, "clEnqueueWriteBuffer a");
+    if (dequant) {
+        // The kernel waits for the quantized upload and signals ev_b once the
+        // float copy of B is ready.
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
+        CL_CHECK(err, "clEnqueueNDRangeKernel");
+        clReleaseEvent(ev_qb);
+    }
+    clWaitForEvents(1, &ev_a);
+    clWaitForEvents(1, &ev_b);
+    clReleaseEvent(ev_a);
+    clReleaseEvent(ev_b);
+    cl_event ev_sgemm;
+    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
+                 (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+                 m, n, k,
+                 alpha,
+                 cl_buffer_a, 0, lda,
+                 cl_buffer_b, 0, ldb,
+                 beta,
+                 cl_buffer_c, 0, ldc,
+                 &queue, &ev_sgemm);
+    if (status != CLBlastSuccess) {
+        fprintf(stderr, "Error: CLBlast SGEMM %d\n", (int)status);
+        abort();
+    }
+    cl_event ev_c;
+    err = clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
+    CL_CHECK(err, "clEnqueueReadBuffer");
+    // Wait for completion
+    clWaitForEvents(1, &ev_c);
+    clReleaseEvent(ev_sgemm);
+    clReleaseEvent(ev_c);
+}

data/ext/llama_cpp/src/ggml-opencl.h ADDED Viewed

@@ -0,0 +1,24 @@
+// Public interface of the OpenCL (CLBlast) matrix-multiplication backend.
+#pragma once
+#ifdef  __cplusplus
+extern "C" {
+#endif
+// Set up the OpenCL state used by ggml_cl_sgemm_wrapper().
+void ggml_cl_init(void);
+// Matrix storage order.
+// NOTE(review): the implementation casts these values straight to
+// CLBlastLayout, so they presumably must match CLBlast's numeric constants —
+// confirm against clblast_c.h.
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+// Per-operand transpose selector.
+// NOTE(review): cast straight to CLBlastTranspose by the implementation;
+// presumably N = no transpose, T = transpose, C = conjugate transpose —
+// confirm against clblast_c.h.
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+// Single-precision GEMM on the OpenCL device. host_a, host_b and host_c are
+// host pointers; btype is the ggml type of host_b.
+void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+#ifdef  __cplusplus
+}
+#endif