llama_cpp 0.0.7 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +829 -51
- data/ext/llama_cpp/src/ggml-cuda.h +9 -32
- data/ext/llama_cpp/src/ggml-opencl.c +169 -24
- data/ext/llama_cpp/src/ggml.c +6672 -4376
- data/ext/llama_cpp/src/ggml.h +250 -15
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +710 -217
- data/ext/llama_cpp/src/llama.h +75 -28
- data/lib/llama_cpp/client.rb +30 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +41 -7
- metadata +3 -3
@@ -1,43 +1,20 @@
|
|
1
|
-
#include <cublas_v2.h>
|
2
|
-
#include <cuda_runtime.h>
|
1
|
+
#include "ggml.h"
|
3
2
|
|
4
3
|
#ifdef __cplusplus
|
5
4
|
extern "C" {
|
6
5
|
#endif
|
7
6
|
|
8
|
-
|
9
|
-
do { \
|
10
|
-
cudaError_t err_ = (err); \
|
11
|
-
if (err_ != cudaSuccess) { \
|
12
|
-
fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
13
|
-
cudaGetErrorString(err_)); \
|
14
|
-
exit(1); \
|
15
|
-
} \
|
16
|
-
} while (0)
|
17
|
-
|
18
|
-
#define CUBLAS_CHECK(err) \
|
19
|
-
do { \
|
20
|
-
cublasStatus_t err_ = (err); \
|
21
|
-
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
22
|
-
fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
|
23
|
-
exit(1); \
|
24
|
-
} \
|
25
|
-
} while (0)
|
7
|
+
void ggml_init_cublas(void);
|
26
8
|
|
27
|
-
|
28
|
-
|
9
|
+
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
10
|
+
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
11
|
+
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
29
12
|
|
30
|
-
|
31
|
-
void *
|
32
|
-
void
|
13
|
+
// TODO: export these with GGML_API
|
14
|
+
void * ggml_cuda_host_malloc(size_t size);
|
15
|
+
void ggml_cuda_host_free(void * ptr);
|
33
16
|
|
34
|
-
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
35
|
-
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
36
|
-
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
37
|
-
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
38
|
-
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
39
|
-
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
40
|
-
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
17
|
+
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
|
41
18
|
|
42
19
|
#ifdef __cplusplus
|
43
20
|
}
|
@@ -3,12 +3,141 @@
|
|
3
3
|
#define CL_TARGET_OPENCL_VERSION 110
|
4
4
|
#include <clblast_c.h>
|
5
5
|
|
6
|
+
#include <stdlib.h>
|
6
7
|
#include <stdio.h>
|
7
8
|
#include <string.h>
|
8
9
|
|
9
10
|
#include "ggml.h"
|
10
11
|
|
11
|
-
#
|
12
|
+
// Turns its whole argument list into one string literal, so the OpenCL C
// program below can be written as ordinary source text instead of a
// hand-escaped string. Comments inside the argument list are removed by the
// preprocessor before stringification and never reach the OpenCL compiler.
#define MULTILINE_QUOTE(...) #__VA_ARGS__

// OpenCL C program text with one dequantization kernel per quantized block
// format (q4_0, q4_1, q5_0, q5_1, q8_0). Each kernel expands packed blocks
// from `x` into plain floats in `y`.
//
// The kernel names are the ones ggml_cl_init() passes to clCreateKernel(),
// so a rename has to be done in both places.
//
// Indexing shared by all kernels:
//   i = get_global_id(0) / qk   -> block index
//   j = get_local_id(0)         -> position inside the block
// NOTE(review): because j is the local id, the kernels rely on the host
// choosing a work-group size that fits the block layout (qk/2 for the
// nibble-packed formats, qk for q8_0). The visible host code sets local = 16
// for q5_0 and q5_1 and local = 32 for q8_0 - confirm for the other formats.
// There are no bounds checks; the global size must cover whole blocks only.
const char * clblast_dequant = MULTILINE_QUOTE(

// Fixed-width aliases for the OpenCL built-in scalar types.
typedef char int8_t;
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;

// q4_0: 32 values per block, value = (nibble - 8) * d.
constant uint QK4_0 = 32;
struct block_q4_0
{
    float d;                // scale
    uint8_t qs[QK4_0 / 2];  // low nibble -> element j, high nibble -> element j + 16
};

// q4_1: 32 values per block, value = nibble * d + m.
constant uint QK4_1 = 32;
struct block_q4_1
{
    float d;                // scale
    float m;                // offset added after scaling
    uint8_t qs[QK4_1 / 2];  // low nibble -> element j, high nibble -> element j + 16
};

// q5_0: 32 values per block, value = ((bit5 << 4 | nibble) - 16) * d.
// Packed so that qh follows d directly with no alignment padding; the host
// sizes one block as sizeof(ggml_fp16_t) + sizeof(uint32_t) + 16 bytes.
constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
{
    half d;                 // scale, stored as half precision
    uint32_t qh;            // fifth bit of every element: bit j -> element j, bit j + 16 -> element j + 16
    uint8_t qs[QK5_0 / 2];  // low four bits, two elements per byte
};

// q5_1: 32 values per block, value = (bit5 << 4 | nibble) * d + m.
constant uint QK5_1 = 32;
struct block_q5_1
{
    half d;                 // scale, stored as half precision
    half m;                 // offset, stored as half precision
    uint32_t qh;            // fifth bit of every element, same layout as q5_0
    uint8_t qs[QK5_1 / 2];  // low four bits, two elements per byte
};

// q8_0: 32 values per block, value = q * d.
// The quants are SIGNED 8-bit values (the host-side block_q8_0 stores int8_t),
// so qs must be a signed type here. Declaring it as uint8_t made every
// negative quant dequantize as (q + 256) * d.
// NOTE(review): keep this layout in sync with block_q8_0 in ggml.c.
constant uint QK8_0 = 32;
struct block_q8_0
{
    float d;                // scale
    int8_t qs[QK8_0];       // one signed quant per element
};


// Expands q4_0 blocks: each work-item decodes one byte into two floats.
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
    constant uint qk = QK4_0;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;

    // Stored nibbles are biased by 8; subtracting recentres them around zero.
    const int x0 = (x[i].qs[j] & 0xf) - 8;
    const int x1 = (x[i].qs[j] >> 4) - 8;

    y[i*qk + j + 0   ] = x0*d;
    y[i*qk + j + qk/2] = x1*d;
}

// Expands q4_1 blocks: each work-item decodes one byte into two floats.
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
    constant uint qk = QK4_1;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;
    const float m = x[i].m;

    const int x0 = (x[i].qs[j] & 0xf);
    const int x1 = (x[i].qs[j] >> 4);

    y[i*qk + j + 0   ] = x0*d + m;
    y[i*qk + j + qk/2] = x1*d + m;
}

// Expands q5_0 blocks: each work-item decodes one byte plus two bits of qh
// into two floats.
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
    constant uint qk = QK5_0;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    // vload_half reads the half-precision scale and returns it as float.
    const float d = vload_half(0, (__global half*) &x[i].d);

    uint32_t qh = x[i].qh;

    // Move bit j (first half) and bit j + 16 (second half) of qh into bit 4.
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

    // Five-bit values are biased by 16.
    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;

    y[i*qk + j + 0   ] = x0*d;
    y[i*qk + j + qk/2] = x1*d;
}

// Expands q5_1 blocks: same bit layout as q5_0, but unbiased and with an
// additive offset m.
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
    constant uint qk = QK5_1;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = vload_half(0, (__global half*) &x[i].d);
    const float m = vload_half(0, (__global half*) &x[i].m);

    uint32_t qh = x[i].qh;

    // Move bit j (first half) and bit j + 16 (second half) of qh into bit 4.
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
    const int x1 = (x[i].qs[j] >>  4) | xh_1;

    y[i*qk + j + 0   ] = x0*d + m;
    y[i*qk + j + qk/2] = x1*d + m;
}

// Expands q8_0 blocks: each work-item scales one signed quant.
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
    constant uint qk = QK8_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;
    // qs[j] is signed, so negative quants keep their sign through the multiply.
    y[i*qk + j] = x[i].qs[j]*d;
}

);
|
12
141
|
|
13
142
|
#define CL_CHECK(err, name) \
|
14
143
|
do { \
|
@@ -24,7 +153,7 @@ static cl_device_id device;
|
|
24
153
|
static cl_context context;
|
25
154
|
static cl_command_queue queue;
|
26
155
|
static cl_program program;
|
27
|
-
static cl_kernel kernel_q4_0, kernel_q4_1,
|
156
|
+
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
28
157
|
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
29
158
|
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
30
159
|
|
@@ -95,9 +224,11 @@ void ggml_cl_init(void) {
|
|
95
224
|
CL_CHECK(err, "clCreateKernel");
|
96
225
|
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
97
226
|
CL_CHECK(err, "clCreateKernel");
|
98
|
-
|
227
|
+
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
99
228
|
CL_CHECK(err, "clCreateKernel");
|
100
|
-
|
229
|
+
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
230
|
+
CL_CHECK(err, "clCreateKernel");
|
231
|
+
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
|
101
232
|
CL_CHECK(err, "clCreateKernel");
|
102
233
|
}
|
103
234
|
|
@@ -144,17 +275,23 @@ void ggml_cl_sgemm_wrapper(
|
|
144
275
|
local = 16;
|
145
276
|
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
146
277
|
break;
|
147
|
-
case
|
278
|
+
case GGML_TYPE_Q5_0:
|
279
|
+
dequant = true;
|
280
|
+
kernel = kernel_q5_0;
|
281
|
+
local = 16;
|
282
|
+
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
283
|
+
break;
|
284
|
+
case GGML_TYPE_Q5_1:
|
148
285
|
dequant = true;
|
149
|
-
kernel =
|
150
|
-
local =
|
151
|
-
size_qb = global * (sizeof(
|
286
|
+
kernel = kernel_q5_1;
|
287
|
+
local = 16;
|
288
|
+
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
152
289
|
break;
|
153
|
-
case
|
290
|
+
case GGML_TYPE_Q8_0:
|
154
291
|
dequant = true;
|
155
|
-
kernel =
|
156
|
-
local =
|
157
|
-
size_qb = global * (sizeof(
|
292
|
+
kernel = kernel_q8_0;
|
293
|
+
local = 32;
|
294
|
+
size_qb = global * (sizeof(float) + local) / 32;
|
158
295
|
break;
|
159
296
|
default:
|
160
297
|
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
@@ -179,12 +316,15 @@ void ggml_cl_sgemm_wrapper(
|
|
179
316
|
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
180
317
|
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
181
318
|
CL_CHECK(err, "clSetKernelArg");
|
182
|
-
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
319
|
+
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
320
|
+
CL_CHECK(err, "clEnqueueWriteBuffer qb");
|
183
321
|
} else {
|
184
|
-
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
322
|
+
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
323
|
+
CL_CHECK(err, "clEnqueueWriteBuffer b");
|
185
324
|
}
|
186
325
|
|
187
|
-
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
326
|
+
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
327
|
+
CL_CHECK(err, "clEnqueueWriteBuffer a");
|
188
328
|
if (dequant) {
|
189
329
|
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
190
330
|
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
@@ -196,15 +336,20 @@ void ggml_cl_sgemm_wrapper(
|
|
196
336
|
clReleaseEvent(ev_b);
|
197
337
|
|
198
338
|
cl_event ev_sgemm;
|
199
|
-
CLBlastSgemm((CLBlastLayout)order,
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
339
|
+
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
|
340
|
+
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
341
|
+
m, n, k,
|
342
|
+
alpha,
|
343
|
+
cl_buffer_a, 0, lda,
|
344
|
+
cl_buffer_b, 0, ldb,
|
345
|
+
beta,
|
346
|
+
cl_buffer_c, 0, ldc,
|
347
|
+
&queue, &ev_sgemm);
|
348
|
+
|
349
|
+
if (status != CLBlastSuccess) {
|
350
|
+
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
|
351
|
+
abort();
|
352
|
+
}
|
208
353
|
|
209
354
|
cl_event ev_c;
|
210
355
|
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|