llama_cpp 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +26 -0
- data/ext/llama_cpp/src/ggml-cuda.h +32 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1436 -624
- data/ext/llama_cpp/src/ggml.h +654 -627
- data/ext/llama_cpp/src/llama.cpp +212 -29
- data/ext/llama_cpp/src/llama.h +17 -13
- data/ext/llama_cpp/src/llama_util.h +15 -2
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +16 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8dbb27d65bfca1fc1e0c080fccf6de6a499e3c829e0542e08cd82d0bedb64a28
|
4
|
+
data.tar.gz: d115f89c533c41296a330da96f7b719a2c649d7fbd9256aeb033f72d9c7cb190
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4a818135e9873ace4ca931542897d9037a5430baee209a04b248ffd03b44a1360120e3d1d01089a46af1f9dbf5fec3a18a4d6ccd540d6463089a7b806f20ec7
|
7
|
+
data.tar.gz: ad80dcc69d2e14c462cc22aea0456296af9cd5015d499bb8683766b21cec2ccf619f827e8ad3a4417cbbd5ec45944dd85837a157be82279cf66e6de6b5ae0bc8
|
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,21 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [[0.0.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.6...v0.0.7)] - 2023-04-29
|
4
|
+
|
5
|
+
- Bump bundled llama.cpp from master-12b5900 to master-11d9023.
|
6
|
+
- Add Client class.
|
7
|
+
- Add model file type constants.
|
8
|
+
- Add getter and setter methods of use_mmap to ContextParams.
|
9
|
+
- Add empty? method to Context.
|
10
|
+
- Add clblast config option:
|
11
|
+
```
|
12
|
+
$ gem install llama_cpp -- --with-clblast
|
13
|
+
```
|
14
|
+
|
3
15
|
## [[0.0.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.5...v0.0.6)] - 2023-04-22
|
4
16
|
|
5
17
|
- Bump bundled llama.cpp from master-315a95a to master-12b5900.
|
6
|
-
- Add model file type constants
|
18
|
+
- Add model file type constants.
|
7
19
|
- Add `model_quantize` module function to LLaMACpp.
|
8
20
|
- Add cublas config option:
|
9
21
|
```
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -5,6 +5,8 @@ require 'mkmf'
|
|
5
5
|
abort 'libstdc++ is not found.' unless have_library('stdc++')
|
6
6
|
|
7
7
|
$srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
|
8
|
+
$srcs << 'ggml-opencl.c' if with_config('clblast')
|
9
|
+
|
8
10
|
$CFLAGS << ' -w'
|
9
11
|
$CXXFLAGS << ' -std=c++11'
|
10
12
|
$INCFLAGS << ' -I$(srcdir)/src'
|
@@ -34,6 +36,13 @@ if with_config('cublas')
|
|
34
36
|
$objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
|
35
37
|
end
|
36
38
|
|
39
|
+
if with_config('clblast')
|
40
|
+
abort 'libclblast is not found.' unless have_library('clblast')
|
41
|
+
abort 'libOpenCL is not found.' unless have_library('OpenCL')
|
42
|
+
|
43
|
+
$CFLAGS << ' -DGGML_USE_CLBLAST'
|
44
|
+
end
|
45
|
+
|
37
46
|
UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
|
38
47
|
|
39
48
|
# rubocop:disable Layout/LineLength
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -53,6 +53,8 @@ public:
|
|
53
53
|
rb_define_method(rb_cLLaMAContextParams, "logits_all", RUBY_METHOD_FUNC(_llama_context_params_get_logits_all), 0);
|
54
54
|
rb_define_method(rb_cLLaMAContextParams, "vocab_only=", RUBY_METHOD_FUNC(_llama_context_params_set_vocab_only), 1);
|
55
55
|
rb_define_method(rb_cLLaMAContextParams, "vocab_only", RUBY_METHOD_FUNC(_llama_context_params_get_vocab_only), 0);
|
56
|
+
rb_define_method(rb_cLLaMAContextParams, "use_mmap=", RUBY_METHOD_FUNC(_llama_context_params_set_use_mmap), 1);
|
57
|
+
rb_define_method(rb_cLLaMAContextParams, "use_mmap", RUBY_METHOD_FUNC(_llama_context_params_get_use_mmap), 0);
|
56
58
|
rb_define_method(rb_cLLaMAContextParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_context_params_set_use_mlock), 1);
|
57
59
|
rb_define_method(rb_cLLaMAContextParams, "use_mlock", RUBY_METHOD_FUNC(_llama_context_params_get_use_mlock), 0);
|
58
60
|
rb_define_method(rb_cLLaMAContextParams, "embedding=", RUBY_METHOD_FUNC(_llama_context_params_set_embedding), 1);
|
@@ -140,6 +142,18 @@ private:
|
|
140
142
|
return ptr->params.vocab_only ? Qtrue : Qfalse;
|
141
143
|
};
|
142
144
|
|
145
|
+
// use_mmap
|
146
|
+
static VALUE _llama_context_params_set_use_mmap(VALUE self, VALUE use_mmap) {
|
147
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
148
|
+
ptr->params.use_mmap = use_mmap == Qtrue ? true : false;
|
149
|
+
return ptr->params.use_mmap ? Qtrue : Qfalse;
|
150
|
+
};
|
151
|
+
|
152
|
+
static VALUE _llama_context_params_get_use_mmap(VALUE self) {
|
153
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
154
|
+
return ptr->params.use_mmap ? Qtrue : Qfalse;
|
155
|
+
};
|
156
|
+
|
143
157
|
// use_mlock
|
144
158
|
static VALUE _llama_context_params_set_use_mlock(VALUE self, VALUE use_mlock) {
|
145
159
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -226,6 +240,7 @@ public:
|
|
226
240
|
rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
|
227
241
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
228
242
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
243
|
+
rb_define_method(rb_cLLaMAContext, "empty?", RUBY_METHOD_FUNC(_llama_context_empty), 0);
|
229
244
|
rb_define_method(rb_cLLaMAContext, "free", RUBY_METHOD_FUNC(_llama_context_free), 0);
|
230
245
|
rb_define_method(rb_cLLaMAContext, "load", RUBY_METHOD_FUNC(_llama_context_load), -1);
|
231
246
|
rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
|
@@ -514,6 +529,14 @@ private:
|
|
514
529
|
return Qnil;
|
515
530
|
};
|
516
531
|
|
532
|
+
static VALUE _llama_context_empty(VALUE self) {
|
533
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
534
|
+
if (ptr->ctx != NULL) {
|
535
|
+
return Qfalse;
|
536
|
+
}
|
537
|
+
return Qtrue;
|
538
|
+
}
|
539
|
+
|
517
540
|
static VALUE _llama_context_free(VALUE self) {
|
518
541
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
519
542
|
if (ptr->ctx != NULL) {
|
@@ -689,6 +712,9 @@ extern "C" void Init_llama_cpp(void) {
|
|
689
712
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
|
690
713
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
|
691
714
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_3", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_3));
|
715
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
|
716
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
|
717
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
|
692
718
|
|
693
719
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
|
694
720
|
std::stringstream ss_magic;
|
@@ -1,11 +1,43 @@
|
|
1
|
+
#include <cublas_v2.h>
|
2
|
+
#include <cuda_runtime.h>
|
3
|
+
|
1
4
|
#ifdef __cplusplus
|
2
5
|
extern "C" {
|
3
6
|
#endif
|
4
7
|
|
8
|
+
#define CUDA_CHECK(err) \
|
9
|
+
do { \
|
10
|
+
cudaError_t err_ = (err); \
|
11
|
+
if (err_ != cudaSuccess) { \
|
12
|
+
fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
13
|
+
cudaGetErrorString(err_)); \
|
14
|
+
exit(1); \
|
15
|
+
} \
|
16
|
+
} while (0)
|
17
|
+
|
18
|
+
#define CUBLAS_CHECK(err) \
|
19
|
+
do { \
|
20
|
+
cublasStatus_t err_ = (err); \
|
21
|
+
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
22
|
+
fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
|
23
|
+
exit(1); \
|
24
|
+
} \
|
25
|
+
} while (0)
|
26
|
+
|
27
|
+
extern cublasHandle_t g_cublasH;
|
28
|
+
extern cudaStream_t g_cudaStream;
|
29
|
+
|
30
|
+
void ggml_init_cublas(void);
|
31
|
+
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
|
32
|
+
void ggml_cuda_pool_free(void * ptr, size_t size);
|
33
|
+
|
5
34
|
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
6
35
|
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
7
36
|
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
8
37
|
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
38
|
+
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
39
|
+
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
40
|
+
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
9
41
|
|
10
42
|
#ifdef __cplusplus
|
11
43
|
}
|
@@ -0,0 +1,216 @@
|
|
1
|
+
#include "ggml-opencl.h"
|
2
|
+
|
3
|
+
#define CL_TARGET_OPENCL_VERSION 110
|
4
|
+
#include <clblast_c.h>
|
5
|
+
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <string.h>
|
8
|
+
|
9
|
+
#include "ggml.h"
|
10
|
+
|
11
|
+
#include "ggml-opencl-dequant.cl"
|
12
|
+
|
13
|
+
#define CL_CHECK(err, name) \
|
14
|
+
do { \
|
15
|
+
cl_int err_ = (err); \
|
16
|
+
if (err_ != CL_SUCCESS) { \
|
17
|
+
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
18
|
+
exit(1); \
|
19
|
+
} \
|
20
|
+
} while (0)
|
21
|
+
|
22
|
+
static cl_platform_id platform;
|
23
|
+
static cl_device_id device;
|
24
|
+
static cl_context context;
|
25
|
+
static cl_command_queue queue;
|
26
|
+
static cl_program program;
|
27
|
+
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
|
28
|
+
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
29
|
+
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
30
|
+
|
31
|
+
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
32
|
+
cl_program p;
|
33
|
+
char *program_log;
|
34
|
+
size_t program_size, log_size;
|
35
|
+
int err;
|
36
|
+
|
37
|
+
program_size = strlen(program_buffer);
|
38
|
+
|
39
|
+
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
|
40
|
+
if(err < 0) {
|
41
|
+
fprintf(stderr, "OpenCL error creating program");
|
42
|
+
exit(1);
|
43
|
+
}
|
44
|
+
|
45
|
+
err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
|
46
|
+
if(err < 0) {
|
47
|
+
|
48
|
+
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
49
|
+
program_log = (char*) malloc(log_size + 1);
|
50
|
+
program_log[log_size] = '\0';
|
51
|
+
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
|
52
|
+
printf("%s\n", program_log);
|
53
|
+
free(program_log);
|
54
|
+
exit(1);
|
55
|
+
}
|
56
|
+
|
57
|
+
return p;
|
58
|
+
}
|
59
|
+
|
60
|
+
void ggml_cl_init(void) {
|
61
|
+
cl_int err = 0;
|
62
|
+
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
|
63
|
+
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
|
64
|
+
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
|
65
|
+
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
|
66
|
+
printf("\nInitializing CLBlast (First Run)...");
|
67
|
+
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
|
68
|
+
cl_uint num_platforms;
|
69
|
+
clGetPlatformIDs(0, NULL, &num_platforms);
|
70
|
+
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
71
|
+
clGetPlatformIDs(num_platforms, platforms, NULL);
|
72
|
+
platform = platforms[plat_num];
|
73
|
+
char platform_buffer[1024];
|
74
|
+
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
|
75
|
+
cl_uint num_devices;
|
76
|
+
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
77
|
+
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
78
|
+
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
79
|
+
device = devices[dev_num];
|
80
|
+
char device_buffer[1024];
|
81
|
+
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
|
82
|
+
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
|
83
|
+
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
84
|
+
CL_CHECK(err, "clCreateContext");
|
85
|
+
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
86
|
+
CL_CHECK(err, "clCreateCommandQueue");
|
87
|
+
|
88
|
+
free(platforms);
|
89
|
+
free(devices);
|
90
|
+
|
91
|
+
program = build_program_from_source(context, device, clblast_dequant);
|
92
|
+
|
93
|
+
// Prepare dequantize kernels
|
94
|
+
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
|
95
|
+
CL_CHECK(err, "clCreateKernel");
|
96
|
+
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
97
|
+
CL_CHECK(err, "clCreateKernel");
|
98
|
+
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
|
99
|
+
CL_CHECK(err, "clCreateKernel");
|
100
|
+
kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
|
101
|
+
CL_CHECK(err, "clCreateKernel");
|
102
|
+
}
|
103
|
+
|
104
|
+
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
105
|
+
if (req_size <= *cur_size) {
|
106
|
+
return;
|
107
|
+
}
|
108
|
+
|
109
|
+
// Reallocate buffer with enough space
|
110
|
+
if (*cur_size > 0) {
|
111
|
+
clReleaseMemObject(*buf);
|
112
|
+
}
|
113
|
+
cl_int err;
|
114
|
+
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
|
115
|
+
*cur_size = req_size;
|
116
|
+
CL_CHECK(err, "clCreateBuffer");
|
117
|
+
}
|
118
|
+
|
119
|
+
void ggml_cl_sgemm_wrapper(
|
120
|
+
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
121
|
+
const int m, const int n, const int k,
|
122
|
+
const float alpha, const void *host_a, const int lda,
|
123
|
+
const float *host_b, const int ldb, const float beta,
|
124
|
+
float *host_c, const int ldc, const int btype) {
|
125
|
+
cl_int err = 0;
|
126
|
+
|
127
|
+
cl_kernel kernel;
|
128
|
+
size_t global = n * k, local, size_qb;
|
129
|
+
bool dequant;
|
130
|
+
|
131
|
+
switch (btype) {
|
132
|
+
case GGML_TYPE_F32:
|
133
|
+
dequant = false;
|
134
|
+
break;
|
135
|
+
case GGML_TYPE_Q4_0:
|
136
|
+
dequant = true;
|
137
|
+
kernel = kernel_q4_0;
|
138
|
+
local = 16;
|
139
|
+
size_qb = global * (sizeof(float) + local) / 32;
|
140
|
+
break;
|
141
|
+
case GGML_TYPE_Q4_1:
|
142
|
+
dequant = true;
|
143
|
+
kernel = kernel_q4_1;
|
144
|
+
local = 16;
|
145
|
+
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
146
|
+
break;
|
147
|
+
case GGML_TYPE_Q4_2:
|
148
|
+
dequant = true;
|
149
|
+
kernel = kernel_q4_2;
|
150
|
+
local = 8;
|
151
|
+
size_qb = global * (sizeof(short) + local) / 16;
|
152
|
+
break;
|
153
|
+
case GGML_TYPE_Q4_3:
|
154
|
+
dequant = true;
|
155
|
+
kernel = kernel_q4_3;
|
156
|
+
local = 8;
|
157
|
+
size_qb = global * (sizeof(short) * 2 + local) / 16;
|
158
|
+
break;
|
159
|
+
default:
|
160
|
+
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
161
|
+
abort();
|
162
|
+
}
|
163
|
+
|
164
|
+
const size_t size_a = m * k * sizeof(float);
|
165
|
+
const size_t size_b = n * k * sizeof(float);
|
166
|
+
const size_t size_c = m * n * sizeof(float);
|
167
|
+
|
168
|
+
// Prepare buffers
|
169
|
+
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
170
|
+
if (dequant) {
|
171
|
+
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
172
|
+
}
|
173
|
+
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
174
|
+
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
175
|
+
|
176
|
+
cl_event ev_a, ev_qb, ev_b;
|
177
|
+
|
178
|
+
if (dequant) {
|
179
|
+
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
180
|
+
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
181
|
+
CL_CHECK(err, "clSetKernelArg");
|
182
|
+
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
183
|
+
} else {
|
184
|
+
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
185
|
+
}
|
186
|
+
|
187
|
+
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
188
|
+
if (dequant) {
|
189
|
+
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
190
|
+
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
191
|
+
clReleaseEvent(ev_qb);
|
192
|
+
}
|
193
|
+
clWaitForEvents(1, &ev_a);
|
194
|
+
clWaitForEvents(1, &ev_b);
|
195
|
+
clReleaseEvent(ev_a);
|
196
|
+
clReleaseEvent(ev_b);
|
197
|
+
|
198
|
+
cl_event ev_sgemm;
|
199
|
+
CLBlastSgemm((CLBlastLayout)order,
|
200
|
+
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
201
|
+
m, n, k,
|
202
|
+
alpha,
|
203
|
+
cl_buffer_a, 0, lda,
|
204
|
+
cl_buffer_b, 0, ldb,
|
205
|
+
beta,
|
206
|
+
cl_buffer_c, 0, ldc,
|
207
|
+
&queue, &ev_sgemm);
|
208
|
+
|
209
|
+
cl_event ev_c;
|
210
|
+
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|
211
|
+
|
212
|
+
// Wait for completion
|
213
|
+
clWaitForEvents(1, &ev_c);
|
214
|
+
clReleaseEvent(ev_sgemm);
|
215
|
+
clReleaseEvent(ev_c);
|
216
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#ifdef __cplusplus
|
4
|
+
extern "C" {
|
5
|
+
#endif
|
6
|
+
|
7
|
+
void ggml_cl_init(void);
|
8
|
+
|
9
|
+
enum ggml_blas_order {
|
10
|
+
GGML_BLAS_ORDER_ROW_MAJOR = 101,
|
11
|
+
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
|
12
|
+
};
|
13
|
+
|
14
|
+
enum ggml_blas_op {
|
15
|
+
GGML_BLAS_OP_N = 111,
|
16
|
+
GGML_BLAS_OP_T = 112,
|
17
|
+
GGML_BLAS_OP_C = 113,
|
18
|
+
};
|
19
|
+
|
20
|
+
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
|
21
|
+
|
22
|
+
#ifdef __cplusplus
|
23
|
+
}
|
24
|
+
#endif
|