llama_cpp 0.0.7 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,43 +1,20 @@
1
- #include <cublas_v2.h>
2
- #include <cuda_runtime.h>
1
+ #include "ggml.h"
3
2
 
4
3
  #ifdef __cplusplus
5
4
  extern "C" {
6
5
  #endif
7
6
 
8
- #define CUDA_CHECK(err) \
9
- do { \
10
- cudaError_t err_ = (err); \
11
- if (err_ != cudaSuccess) { \
12
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
13
- cudaGetErrorString(err_)); \
14
- exit(1); \
15
- } \
16
- } while (0)
17
-
18
- #define CUBLAS_CHECK(err) \
19
- do { \
20
- cublasStatus_t err_ = (err); \
21
- if (err_ != CUBLAS_STATUS_SUCCESS) { \
22
- fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
23
- exit(1); \
24
- } \
25
- } while (0)
7
+ void ggml_init_cublas(void);
26
8
 
27
- extern cublasHandle_t g_cublasH;
28
- extern cudaStream_t g_cudaStream;
9
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
+ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
11
+ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
29
12
 
30
- void ggml_init_cublas(void);
31
- void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
32
- void ggml_cuda_pool_free(void * ptr, size_t size);
13
+ // TODO: export these with GGML_API
14
+ void * ggml_cuda_host_malloc(size_t size);
15
+ void ggml_cuda_host_free(void * ptr);
33
16
 
34
- void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
35
- void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
36
- void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
37
- void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
38
- void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
39
- void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
40
- void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
17
+ void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
41
18
 
42
19
  #ifdef __cplusplus
43
20
  }
@@ -3,12 +3,141 @@
3
3
  #define CL_TARGET_OPENCL_VERSION 110
4
4
  #include <clblast_c.h>
5
5
 
6
+ #include <stdlib.h>
6
7
  #include <stdio.h>
7
8
  #include <string.h>
8
9
 
9
10
  #include "ggml.h"
10
11
 
11
- #include "ggml-opencl-dequant.cl"
12
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
+ const char * clblast_dequant = MULTILINE_QUOTE(
14
+
15
+ typedef uchar uint8_t;
16
+ typedef int int32_t;
17
+ typedef uint uint32_t;
18
+
19
+ constant uint QK4_0 = 32;
20
+ struct block_q4_0
21
+ {
22
+ float d;
23
+ uint8_t qs[QK4_0 / 2];
24
+ };
25
+
26
+ constant uint QK4_1 = 32;
27
+ struct block_q4_1
28
+ {
29
+ float d;
30
+ float m;
31
+ uint8_t qs[QK4_1 / 2];
32
+ };
33
+
34
+ constant uint QK5_0 = 32;
35
+ struct __attribute__ ((packed)) block_q5_0
36
+ {
37
+ half d;
38
+ uint32_t qh;
39
+ uint8_t qs[QK5_0 / 2];
40
+ };
41
+
42
+ constant uint QK5_1 = 32;
43
+ struct block_q5_1
44
+ {
45
+ half d;
46
+ half m;
47
+ uint32_t qh;
48
+ uint8_t qs[QK5_1 / 2];
49
+ };
50
+
51
+ constant uint QK8_0 = 32;
52
+ struct block_q8_0
53
+ {
54
+ float d;
55
+ uint8_t qs[QK8_0];
56
+ };
57
+
58
+
59
+ __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
60
+ constant uint qk = QK4_0;
61
+
62
+ const uint i = get_global_id(0) / qk;
63
+ const uint j = get_local_id(0);
64
+
65
+ const float d = x[i].d;
66
+
67
+ const int x0 = (x[i].qs[j] & 0xf) - 8;
68
+ const int x1 = (x[i].qs[j] >> 4) - 8;
69
+
70
+ y[i*qk + j + 0 ] = x0*d;
71
+ y[i*qk + j + qk/2] = x1*d;
72
+ }
73
+
74
+ __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
75
+ constant uint qk = QK4_1;
76
+
77
+ const uint i = get_global_id(0) / qk;
78
+ const uint j = get_local_id(0);
79
+
80
+ const float d = x[i].d;
81
+ const float m = x[i].m;
82
+
83
+ const int x0 = (x[i].qs[j] & 0xf);
84
+ const int x1 = (x[i].qs[j] >> 4);
85
+
86
+ y[i*qk + j + 0 ] = x0*d + m;
87
+ y[i*qk + j + qk/2] = x1*d + m;
88
+ }
89
+
90
+ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
91
+ constant uint qk = QK5_0;
92
+
93
+ const uint i = get_global_id(0) / qk;
94
+ const uint j = get_local_id(0);
95
+
96
+ const float d = vload_half(0, (__global half*) &x[i].d);
97
+
98
+ uint32_t qh = x[i].qh;
99
+
100
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
101
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
102
+
103
+ const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
104
+ const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
105
+
106
+ y[i*qk + j + 0 ] = x0*d;
107
+ y[i*qk + j + qk/2] = x1*d;
108
+ }
109
+
110
+ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
111
+ constant uint qk = QK5_1;
112
+
113
+ const uint i = get_global_id(0) / qk;
114
+ const uint j = get_local_id(0);
115
+
116
+ const float d = vload_half(0, (__global half*) &x[i].d);
117
+ const float m = vload_half(0, (__global half*) &x[i].m);
118
+
119
+ uint32_t qh = x[i].qh;
120
+
121
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
122
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
123
+
124
+ const int x0 = (x[i].qs[j] & 0xf) | xh_0;
125
+ const int x1 = (x[i].qs[j] >> 4) | xh_1;
126
+
127
+ y[i*qk + j + 0 ] = x0*d + m;
128
+ y[i*qk + j + qk/2] = x1*d + m;
129
+ }
130
+
131
+ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
132
+ constant uint qk = QK8_0;
133
+ const uint i = get_global_id(0) / qk;
134
+ const uint j = get_local_id(0);
135
+
136
+ const float d = x[i].d;
137
+ y[i*qk + j] = x[i].qs[j]*d;
138
+ }
139
+
140
+ );
12
141
 
13
142
  #define CL_CHECK(err, name) \
14
143
  do { \
@@ -24,7 +153,7 @@ static cl_device_id device;
24
153
  static cl_context context;
25
154
  static cl_command_queue queue;
26
155
  static cl_program program;
27
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
156
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
28
157
  static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
29
158
  static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
30
159
 
@@ -95,9 +224,11 @@ void ggml_cl_init(void) {
95
224
  CL_CHECK(err, "clCreateKernel");
96
225
  kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
97
226
  CL_CHECK(err, "clCreateKernel");
98
- kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
227
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
99
228
  CL_CHECK(err, "clCreateKernel");
100
- kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
229
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
230
+ CL_CHECK(err, "clCreateKernel");
231
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
101
232
  CL_CHECK(err, "clCreateKernel");
102
233
  }
103
234
 
@@ -144,17 +275,23 @@ void ggml_cl_sgemm_wrapper(
144
275
  local = 16;
145
276
  size_qb = global * (sizeof(float) * 2 + local) / 32;
146
277
  break;
147
- case GGML_TYPE_Q4_2:
278
+ case GGML_TYPE_Q5_0:
279
+ dequant = true;
280
+ kernel = kernel_q5_0;
281
+ local = 16;
282
+ size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
283
+ break;
284
+ case GGML_TYPE_Q5_1:
148
285
  dequant = true;
149
- kernel = kernel_q4_2;
150
- local = 8;
151
- size_qb = global * (sizeof(short) + local) / 16;
286
+ kernel = kernel_q5_1;
287
+ local = 16;
288
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
152
289
  break;
153
- case GGML_TYPE_Q4_3:
290
+ case GGML_TYPE_Q8_0:
154
291
  dequant = true;
155
- kernel = kernel_q4_3;
156
- local = 8;
157
- size_qb = global * (sizeof(short) * 2 + local) / 16;
292
+ kernel = kernel_q8_0;
293
+ local = 32;
294
+ size_qb = global * (sizeof(float) + local) / 32;
158
295
  break;
159
296
  default:
160
297
  fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -179,12 +316,15 @@ void ggml_cl_sgemm_wrapper(
179
316
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
180
317
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
181
318
  CL_CHECK(err, "clSetKernelArg");
182
- clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
319
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
320
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
183
321
  } else {
184
- clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
322
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
323
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
185
324
  }
186
325
 
187
- clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
326
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
327
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
188
328
  if (dequant) {
189
329
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
190
330
  CL_CHECK(err, "clEnqueueNDRangeKernel");
@@ -196,15 +336,20 @@ void ggml_cl_sgemm_wrapper(
196
336
  clReleaseEvent(ev_b);
197
337
 
198
338
  cl_event ev_sgemm;
199
- CLBlastSgemm((CLBlastLayout)order,
200
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
201
- m, n, k,
202
- alpha,
203
- cl_buffer_a, 0, lda,
204
- cl_buffer_b, 0, ldb,
205
- beta,
206
- cl_buffer_c, 0, ldc,
207
- &queue, &ev_sgemm);
339
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
340
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
341
+ m, n, k,
342
+ alpha,
343
+ cl_buffer_a, 0, lda,
344
+ cl_buffer_b, 0, ldb,
345
+ beta,
346
+ cl_buffer_c, 0, ldc,
347
+ &queue, &ev_sgemm);
348
+
349
+ if (status != CLBlastSuccess) {
350
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
351
+ abort();
352
+ }
208
353
 
209
354
  cl_event ev_c;
210
355
  clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);