llama_cpp 0.0.7 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,43 +1,20 @@
1
- #include <cublas_v2.h>
2
- #include <cuda_runtime.h>
1
+ #include "ggml.h"
3
2
 
4
3
  #ifdef __cplusplus
5
4
  extern "C" {
6
5
  #endif
7
6
 
8
- #define CUDA_CHECK(err) \
9
- do { \
10
- cudaError_t err_ = (err); \
11
- if (err_ != cudaSuccess) { \
12
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
13
- cudaGetErrorString(err_)); \
14
- exit(1); \
15
- } \
16
- } while (0)
17
-
18
- #define CUBLAS_CHECK(err) \
19
- do { \
20
- cublasStatus_t err_ = (err); \
21
- if (err_ != CUBLAS_STATUS_SUCCESS) { \
22
- fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
23
- exit(1); \
24
- } \
25
- } while (0)
7
+ void ggml_init_cublas(void);
26
8
 
27
- extern cublasHandle_t g_cublasH;
28
- extern cudaStream_t g_cudaStream;
9
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
+ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
11
+ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
29
12
 
30
- void ggml_init_cublas(void);
31
- void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
32
- void ggml_cuda_pool_free(void * ptr, size_t size);
13
+ // TODO: export these with GGML_API
14
+ void * ggml_cuda_host_malloc(size_t size);
15
+ void ggml_cuda_host_free(void * ptr);
33
16
 
34
- void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
35
- void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
36
- void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
37
- void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
38
- void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
39
- void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
40
- void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
17
+ void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
41
18
 
42
19
  #ifdef __cplusplus
43
20
  }
@@ -3,12 +3,141 @@
3
3
  #define CL_TARGET_OPENCL_VERSION 110
4
4
  #include <clblast_c.h>
5
5
 
6
+ #include <stdlib.h>
6
7
  #include <stdio.h>
7
8
  #include <string.h>
8
9
 
9
10
  #include "ggml.h"
10
11
 
11
- #include "ggml-opencl-dequant.cl"
12
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
+ const char * clblast_dequant = MULTILINE_QUOTE(
14
+
15
+ typedef uchar uint8_t; // C99-style fixed-width aliases for the OpenCL built-in scalar types
16
+ typedef int int32_t;
17
+ typedef uint uint32_t;
18
+
19
+ constant uint QK4_0 = 32; // values per q4_0 block: the kernel writes y[i*QK4_0 + ...]
20
+ struct block_q4_0 // one q4_0 block: a scale plus packed 4-bit quants
21
+ {
22
+ float d; // scale multiplied into every dequantized value
23
+ uint8_t qs[QK4_0 / 2]; // two 4-bit quants per byte: low nibble and high nibble
24
+ };
25
+
26
+ constant uint QK4_1 = 32; // values per q4_1 block
27
+ struct block_q4_1 // one q4_1 block: scale, offset and packed 4-bit quants
28
+ {
29
+ float d; // scale
30
+ float m; // offset added after scaling: the kernel computes q*d + m
31
+ uint8_t qs[QK4_1 / 2]; // two 4-bit quants per byte
32
+ };
33
+
34
+ constant uint QK5_0 = 32; // values per q5_0 block
35
+ struct __attribute__ ((packed)) block_q5_0 // packed: the host sizes a block as fp16 + uint32 + 16 bytes, no padding
36
+ {
37
+ half d; // scale stored as half; the kernel reads it through vload_half
38
+ uint32_t qh; // one extra high bit per value: bit j and bit j+16 are ORed in as 0x10
39
+ uint8_t qs[QK5_0 / 2]; // low 4 bits of two quants per byte
40
+ };
41
+
42
+ constant uint QK5_1 = 32; // values per q5_1 block
43
+ struct block_q5_1 // one q5_1 block: half scale, half offset, high bits and packed low nibbles
44
+ {
45
+ half d; // scale, read through vload_half
46
+ half m; // offset, read through vload_half and added after scaling
47
+ uint32_t qh; // one extra high bit per value, same bit positions as in block_q5_0
48
+ uint8_t qs[QK5_1 / 2]; // low 4 bits of two quants per byte
49
+ };
50
+
51
+ constant uint QK8_0 = 32; // values per q8_0 block
52
+ struct block_q8_0 // one q8_0 block: a scale plus one byte per value
53
+ {
54
+ float d; // scale
55
+ uint8_t qs[QK8_0]; // NOTE(review): declared unsigned here; presumably the host stores signed 8-bit quants, confirm against ggml
56
+ };
57
+
58
+
59
// Dequantize q4_0 blocks from x into floats in y.
// The block index is the global id divided by the block size; the local id
// selects one byte of the block. Each work-item unpacks the two 4-bit quants
// of that byte, recentres them by subtracting 8, scales them by the block
// scale, and stores them at offsets lane and lane + qk/2 inside the block.
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
    constant uint qk = QK4_0;

    const uint blk  = get_global_id(0) / qk;
    const uint lane = get_local_id(0);

    const float scale = x[blk].d;
    const uchar pair  = x[blk].qs[lane];

    // Low nibble is the first-half value, high nibble the second-half value.
    const int q_lo = (pair & 0xf) - 8;
    const int q_hi = (pair >> 4) - 8;

    const uint base = blk*qk + lane;
    y[base]        = q_lo*scale;
    y[base + qk/2] = q_hi*scale;
}
73
+
74
// Dequantize q4_1 blocks from x into floats in y.
// Same indexing as the q4_0 kernel: block index from the global id, byte index
// from the local id. The two unsigned 4-bit quants of the selected byte are
// scaled by d and shifted by the block offset m.
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
    constant uint qk = QK4_1;

    const uint blk  = get_global_id(0) / qk;
    const uint lane = get_local_id(0);

    const float scale  = x[blk].d;
    const float offset = x[blk].m;
    const uchar pair   = x[blk].qs[lane];

    // No recentring here: the nibbles are used as unsigned 0..15 values.
    const int q_lo = (pair & 0xf);
    const int q_hi = (pair >> 4);

    const uint base = blk*qk + lane;
    y[base]        = q_lo*scale + offset;
    y[base + qk/2] = q_hi*scale + offset;
}
89
+
90
// Dequantize q5_0 blocks from x into floats in y.
// Each value has 5 bits: the low 4 come from a nibble of qs, the fifth from
// the qh bit field. The 5-bit value is recentred by subtracting 16 and scaled
// by the half-precision block scale.
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
    constant uint qk = QK5_0;

    const uint blk  = get_global_id(0) / qk;
    const uint lane = get_local_id(0);

    // The scale is stored as half; vload_half converts it to float.
    const float scale = vload_half(0, (__global half*) &x[blk].d);

    uint32_t high_bits = x[blk].qh;

    // Move bit lane (first half) and bit lane + 16 (second half) of qh into
    // bit position 4 so they can be ORed on top of the 4-bit nibble.
    const uint8_t fifth_lo = ((high_bits >> (lane +  0)) << 4) & 0x10;
    const uint8_t fifth_hi = ((high_bits >> (lane + 12))     ) & 0x10;

    const int32_t q_lo = ((x[blk].qs[lane] & 0xf) | fifth_lo) - 16;
    const int32_t q_hi = ((x[blk].qs[lane] >>  4) | fifth_hi) - 16;

    const uint base = blk*qk + lane;
    y[base]        = q_lo*scale;
    y[base + qk/2] = q_hi*scale;
}
109
+
110
// Dequantize q5_1 blocks from x into floats in y.
// Like the q5_0 kernel, each value is a 4-bit nibble from qs plus a fifth bit
// from qh, but the result stays unsigned and is mapped as q*d + m using the
// half-precision scale and offset of the block.
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
    constant uint qk = QK5_1;

    const uint blk  = get_global_id(0) / qk;
    const uint lane = get_local_id(0);

    // Scale and offset are stored as half; vload_half converts them to float.
    const float scale  = vload_half(0, (__global half*) &x[blk].d);
    const float offset = vload_half(0, (__global half*) &x[blk].m);

    uint32_t high_bits = x[blk].qh;

    // Move bit lane (first half) and bit lane + 16 (second half) of qh into
    // bit position 4 so they can be ORed on top of the 4-bit nibble.
    const uint8_t fifth_lo = ((high_bits >> (lane +  0)) << 4) & 0x10;
    const uint8_t fifth_hi = ((high_bits >> (lane + 12))     ) & 0x10;

    const int q_lo = (x[blk].qs[lane] & 0xf) | fifth_lo;
    const int q_hi = (x[blk].qs[lane] >>  4) | fifth_hi;

    const uint base = blk*qk + lane;
    y[base]        = q_lo*scale + offset;
    y[base + qk/2] = q_hi*scale + offset;
}
130
+
131
// Dequantize q8_0 blocks from x into floats in y.
// The block index is the global id divided by the block size; the local id
// selects the value inside the block, one byte per value. Each work-item
// writes exactly one output: the quant byte times the block scale.
//
// Fix: block_q8_0 declares qs as unsigned bytes, so negative quants were
// dequantized as 128..255 times the scale. ggml stores Q8_0 quants as signed
// 8-bit integers on the host (confirm against the block_q8_0 definition in
// ggml), so the byte is reinterpreted as signed before scaling.
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
    constant uint qk = QK8_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;

    // as_char keeps the bit pattern and only changes the signedness.
    const char q = as_char(x[i].qs[j]);

    y[i*qk + j] = q*d;
}
139
+
140
+ );
12
141
 
13
142
  #define CL_CHECK(err, name) \
14
143
  do { \
@@ -24,7 +153,7 @@ static cl_device_id device;
24
153
  static cl_context context;
25
154
  static cl_command_queue queue;
26
155
  static cl_program program;
27
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
156
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
28
157
  static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
29
158
  static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
30
159
 
@@ -95,9 +224,11 @@ void ggml_cl_init(void) {
95
224
  CL_CHECK(err, "clCreateKernel");
96
225
  kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
97
226
  CL_CHECK(err, "clCreateKernel");
98
- kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
227
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
99
228
  CL_CHECK(err, "clCreateKernel");
100
- kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
229
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
230
+ CL_CHECK(err, "clCreateKernel");
231
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
101
232
  CL_CHECK(err, "clCreateKernel");
102
233
  }
103
234
 
@@ -144,17 +275,23 @@ void ggml_cl_sgemm_wrapper(
144
275
  local = 16;
145
276
  size_qb = global * (sizeof(float) * 2 + local) / 32;
146
277
  break;
147
- case GGML_TYPE_Q4_2:
278
+ case GGML_TYPE_Q5_0:
279
+ dequant = true;
280
+ kernel = kernel_q5_0;
281
+ local = 16;
282
+ size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
283
+ break;
284
+ case GGML_TYPE_Q5_1:
148
285
  dequant = true;
149
- kernel = kernel_q4_2;
150
- local = 8;
151
- size_qb = global * (sizeof(short) + local) / 16;
286
+ kernel = kernel_q5_1;
287
+ local = 16;
288
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
152
289
  break;
153
- case GGML_TYPE_Q4_3:
290
+ case GGML_TYPE_Q8_0:
154
291
  dequant = true;
155
- kernel = kernel_q4_3;
156
- local = 8;
157
- size_qb = global * (sizeof(short) * 2 + local) / 16;
292
+ kernel = kernel_q8_0;
293
+ local = 32;
294
+ size_qb = global * (sizeof(float) + local) / 32;
158
295
  break;
159
296
  default:
160
297
  fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -179,12 +316,15 @@ void ggml_cl_sgemm_wrapper(
179
316
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
180
317
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
181
318
  CL_CHECK(err, "clSetKernelArg");
182
- clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
319
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
320
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
183
321
  } else {
184
- clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
322
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
323
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
185
324
  }
186
325
 
187
- clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
326
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
327
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
188
328
  if (dequant) {
189
329
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
190
330
  CL_CHECK(err, "clEnqueueNDRangeKernel");
@@ -196,15 +336,20 @@ void ggml_cl_sgemm_wrapper(
196
336
  clReleaseEvent(ev_b);
197
337
 
198
338
  cl_event ev_sgemm;
199
- CLBlastSgemm((CLBlastLayout)order,
200
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
201
- m, n, k,
202
- alpha,
203
- cl_buffer_a, 0, lda,
204
- cl_buffer_b, 0, ldb,
205
- beta,
206
- cl_buffer_c, 0, ldc,
207
- &queue, &ev_sgemm);
339
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
340
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
341
+ m, n, k,
342
+ alpha,
343
+ cl_buffer_a, 0, lda,
344
+ cl_buffer_b, 0, ldb,
345
+ beta,
346
+ cl_buffer_c, 0, ldc,
347
+ &queue, &ev_sgemm);
348
+
349
+ if (status != CLBlastSuccess) {
350
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
351
+ abort();
352
+ }
208
353
 
209
354
  cl_event ev_c;
210
355
  clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);