llama_cpp 0.0.7 → 0.1.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,43 +1,18 @@
1
- #include <cublas_v2.h>
2
- #include <cuda_runtime.h>
1
+ #include "ggml.h"
3
2
 
4
3
  #ifdef __cplusplus
5
4
  extern "C" {
6
5
  #endif
7
6
 
8
- #define CUDA_CHECK(err) \
9
- do { \
10
- cudaError_t err_ = (err); \
11
- if (err_ != cudaSuccess) { \
12
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
13
- cudaGetErrorString(err_)); \
14
- exit(1); \
15
- } \
16
- } while (0)
17
-
18
- #define CUBLAS_CHECK(err) \
19
- do { \
20
- cublasStatus_t err_ = (err); \
21
- if (err_ != CUBLAS_STATUS_SUCCESS) { \
22
- fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
23
- exit(1); \
24
- } \
25
- } while (0)
26
-
27
- extern cublasHandle_t g_cublasH;
28
- extern cudaStream_t g_cudaStream;
29
-
30
7
  void ggml_init_cublas(void);
31
- void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
32
- void ggml_cuda_pool_free(void * ptr, size_t size);
33
8
 
34
- void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
35
- void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
36
- void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
37
- void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
38
- void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
39
- void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
40
- void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
9
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
+ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
11
+ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
12
+
13
+ // TODO: export these with GGML_API
14
+ void * ggml_cuda_host_malloc(size_t size);
15
+ void ggml_cuda_host_free(void * ptr);
41
16
 
42
17
  #ifdef __cplusplus
43
18
  }
@@ -3,12 +3,141 @@
3
3
  #define CL_TARGET_OPENCL_VERSION 110
4
4
  #include <clblast_c.h>
5
5
 
6
+ #include <stdlib.h>
6
7
  #include <stdio.h>
7
8
  #include <string.h>
8
9
 
9
10
  #include "ggml.h"
10
11
 
11
- #include "ggml-opencl-dequant.cl"
12
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
+ const char * clblast_dequant = MULTILINE_QUOTE(
14
+
15
+ struct block_q4_0
16
+ {
17
+ float d;
18
+ uchar qs[16];
19
+ };
20
+
21
+ __kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
22
+ const uint i = get_global_id(0) / 32;
23
+ const uint l = get_local_id(0);
24
+
25
+ const float d = blocks[i].d;
26
+
27
+ const uchar vi = blocks[i].qs[l];
28
+
29
+ const uint index = i*32 + l*2;
30
+ result[index + 0] = ((vi & 0xf) - 8)*d;
31
+ result[index + 1] = ((vi >> 4) - 8)*d;
32
+ }
33
+
34
+ struct block_q4_1
35
+ {
36
+ float d;
37
+ float m;
38
+ uchar qs[16];
39
+ };
40
+
41
+ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
42
+ const uint i = get_global_id(0) / 32;
43
+ const uint l = get_local_id(0);
44
+
45
+ const float d = blocks[i].d;
46
+ const float m = blocks[i].m;
47
+
48
+ const uchar vi = blocks[i].qs[l];
49
+
50
+ const uint index = i*32 + l*2;
51
+ result[index + 0] = (vi & 0xf) * d + m;
52
+ result[index + 1] = (vi >> 4) * d + m;
53
+ }
54
+
55
+ struct block_q4_2
56
+ {
57
+ ushort d;
58
+ uchar qs[8];
59
+ };
60
+
61
+ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
62
+ const uint i = get_global_id(0) / 16;
63
+ const uint l = get_local_id(0);
64
+
65
+ const float d = vload_half(0, (__global half*) &blocks[i].d);
66
+
67
+ const uchar vi = blocks[i].qs[l];
68
+
69
+ const uint index = i*16 + l*2;
70
+ result[index + 0] = ((vi & 0xf) - 8)*d;
71
+ result[index + 1] = ((vi >> 4) - 8)*d;
72
+ }
73
+
74
+
75
+ struct block_q5_0
76
+ {
77
+ float d;
78
+ uint qh;
79
+ uchar qs[16];
80
+ };
81
+
82
+ __kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
83
+ const uint i = get_global_id(0) / 32;
84
+ const uint l = get_local_id(0);
85
+
86
+ const float d = blocks[i].d;
87
+
88
+ const uchar vi = blocks[i].qs[l];
89
+
90
+ const uint l2 = l * 2;
91
+
92
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
93
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
94
+
95
+ const uint index = i*32 + l2;
96
+ result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
97
+ result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
98
+ }
99
+
100
+ struct block_q5_1
101
+ {
102
+ ushort d;
103
+ ushort m;
104
+ uint qh;
105
+ uchar qs[16];
106
+ };
107
+
108
+ __kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
109
+ const uint i = get_global_id(0) / 32;
110
+ const uint l = get_local_id(0);
111
+
112
+ const float d = vload_half(0, (__global half*) &blocks[i].d);
113
+ const float m = vload_half(0, (__global half*) &blocks[i].m);
114
+
115
+ const uchar vi = blocks[i].qs[l];
116
+
117
+ const uint l2 = l * 2;
118
+
119
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
120
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
121
+
122
+ const uint index = i*32 + l2;
123
+ result[index + 0] = ((vi & 0xf) | vh0)*d + m;
124
+ result[index + 1] = ((vi >> 4) | vh1)*d + m;
125
+ }
126
+
127
+ struct block_q8_0
128
+ {
129
+ float d;
130
+ char qs[32];
131
+ };
132
+
133
+ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
134
+ const uint i = get_global_id(0) / 32;
135
+ const uint l = get_local_id(0);
136
+
137
+ result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
138
+ }
139
+
140
+ );
12
141
 
13
142
  #define CL_CHECK(err, name) \
14
143
  do { \
@@ -19,12 +148,26 @@
19
148
  } \
20
149
  } while (0)
21
150
 
151
+ #define QK5_0 32
152
+ typedef struct {
153
+ ggml_fp16_t d; // delta
154
+ uint8_t qh[4]; // 5-th bit of quants
155
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
156
+ } block_q5_0;
157
+
158
+
159
+ typedef struct {
160
+ float d; // delta
161
+ uint32_t qh; // 5-th bit of quants
162
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
163
+ } cl_block_q5_0;
164
+
22
165
  static cl_platform_id platform;
23
166
  static cl_device_id device;
24
167
  static cl_context context;
25
168
  static cl_command_queue queue;
26
169
  static cl_program program;
27
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
170
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
28
171
  static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
29
172
  static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
30
173
 
@@ -97,7 +240,11 @@ void ggml_cl_init(void) {
97
240
  CL_CHECK(err, "clCreateKernel");
98
241
  kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
99
242
  CL_CHECK(err, "clCreateKernel");
100
- kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
243
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244
+ CL_CHECK(err, "clCreateKernel");
245
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
246
+ CL_CHECK(err, "clCreateKernel");
247
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
101
248
  CL_CHECK(err, "clCreateKernel");
102
249
  }
103
250
 
@@ -127,6 +274,7 @@ void ggml_cl_sgemm_wrapper(
127
274
  cl_kernel kernel;
128
275
  size_t global = n * k, local, size_qb;
129
276
  bool dequant;
277
+ cl_block_q5_0* cl_host_b;
130
278
 
131
279
  switch (btype) {
132
280
  case GGML_TYPE_F32:
@@ -148,13 +296,36 @@ void ggml_cl_sgemm_wrapper(
148
296
  dequant = true;
149
297
  kernel = kernel_q4_2;
150
298
  local = 8;
151
- size_qb = global * (sizeof(short) + local) / 16;
299
+ size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
152
300
  break;
153
- case GGML_TYPE_Q4_3:
301
+ case GGML_TYPE_Q5_0:
154
302
  dequant = true;
155
- kernel = kernel_q4_3;
156
- local = 8;
157
- size_qb = global * (sizeof(short) * 2 + local) / 16;
303
+ kernel = kernel_q5_0;
304
+ local = 16;
305
+ // For some reason OpenCL seems to be incapable of working with structs of size 22.
306
+ // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
307
+ // TODO Find the reason, fix and remove workaround.
308
+ const block_q5_0* b = (const block_q5_0*) host_b;
309
+ cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
310
+ for (size_t i = 0; i < global / 32; i++) {
311
+ cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
312
+ memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
313
+ memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
314
+ }
315
+ host_b = (const float*) cl_host_b;
316
+ size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
317
+ break;
318
+ case GGML_TYPE_Q5_1:
319
+ dequant = true;
320
+ kernel = kernel_q5_1;
321
+ local = 16;
322
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
323
+ break;
324
+ case GGML_TYPE_Q8_0:
325
+ dequant = true;
326
+ kernel = kernel_q8_0;
327
+ local = 32;
328
+ size_qb = global * (sizeof(float) + local) / 32;
158
329
  break;
159
330
  default:
160
331
  fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -179,12 +350,15 @@ void ggml_cl_sgemm_wrapper(
179
350
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
180
351
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
181
352
  CL_CHECK(err, "clSetKernelArg");
182
- clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
353
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
354
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
183
355
  } else {
184
- clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
356
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
357
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
185
358
  }
186
359
 
187
- clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
360
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
361
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
188
362
  if (dequant) {
189
363
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
190
364
  CL_CHECK(err, "clEnqueueNDRangeKernel");
@@ -196,15 +370,20 @@ void ggml_cl_sgemm_wrapper(
196
370
  clReleaseEvent(ev_b);
197
371
 
198
372
  cl_event ev_sgemm;
199
- CLBlastSgemm((CLBlastLayout)order,
200
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
201
- m, n, k,
202
- alpha,
203
- cl_buffer_a, 0, lda,
204
- cl_buffer_b, 0, ldb,
205
- beta,
206
- cl_buffer_c, 0, ldc,
207
- &queue, &ev_sgemm);
373
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
374
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
375
+ m, n, k,
376
+ alpha,
377
+ cl_buffer_a, 0, lda,
378
+ cl_buffer_b, 0, ldb,
379
+ beta,
380
+ cl_buffer_c, 0, ldc,
381
+ &queue, &ev_sgemm);
382
+
383
+ if (status != CLBlastSuccess) {
384
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
385
+ abort();
386
+ }
208
387
 
209
388
  cl_event ev_c;
210
389
  clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
@@ -213,4 +392,7 @@ void ggml_cl_sgemm_wrapper(
213
392
  clWaitForEvents(1, &ev_c);
214
393
  clReleaseEvent(ev_sgemm);
215
394
  clReleaseEvent(ev_c);
395
+ if (btype == GGML_TYPE_Q5_0) {
396
+ free((void*) cl_host_b);
397
+ }
216
398
  }