llama_cpp 0.0.6 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,18 @@
1
+ #include "ggml.h"
2
+
1
3
  #ifdef __cplusplus
2
4
  extern "C" {
3
5
  #endif
4
6
 
5
- void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
6
- void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
7
- void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
8
- void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
7
+ void ggml_init_cublas(void);
8
+
9
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
+ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
11
+ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
12
+
13
+ // TODO: export these with GGML_API
14
+ void * ggml_cuda_host_malloc(size_t size);
15
+ void ggml_cuda_host_free(void * ptr);
9
16
 
10
17
  #ifdef __cplusplus
11
18
  }
@@ -0,0 +1,398 @@
1
+ #include "ggml-opencl.h"
2
+
3
+ #define CL_TARGET_OPENCL_VERSION 110
4
+ #include <clblast_c.h>
5
+
6
+ #include <stdlib.h>
7
+ #include <stdio.h>
8
+ #include <string.h>
9
+
10
+ #include "ggml.h"
11
+
12
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__ // Stringifies its whole argument list into a single string literal.
13
+ const char * clblast_dequant = MULTILINE_QUOTE( // OpenCL C source of the dequantization kernels; compiled at runtime by ggml_cl_init().
14
+ // Comments placed inside this macro call are stripped by the preprocessor before stringification.
15
+ struct block_q4_0 // Q4_0 block as the kernel reads it.
16
+ {
17
+ float d; // multiplier applied to every value of the block
18
+ uchar qs[16]; // two 4-bit values per byte (low nibble, then high nibble)
19
+ };
20
+ // Each work-item of the kernel below expands one byte of qs into two floats.
21
+ __kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
22
+ const uint i = get_global_id(0) / 32; // block index
23
+ const uint l = get_local_id(0); // byte index inside the block
24
+ // NOTE(review): the host launches n*k work-items with local size 16, so two work-groups appear to map to each block - confirm.
25
+ const float d = blocks[i].d;
26
+
27
+ const uchar vi = blocks[i].qs[l];
28
+
29
+ const uint index = i*32 + l*2; // first of the two output slots written by this work-item
30
+ result[index + 0] = ((vi & 0xf) - 8)*d; // low nibble, shifted by -8, then scaled
31
+ result[index + 1] = ((vi >> 4) - 8)*d; // high nibble, shifted by -8, then scaled
32
+ }
33
+
34
+ struct block_q4_1 // Q4_1 block: like Q4_0 plus an additive term m.
35
+ {
36
+ float d; // multiplier
37
+ float m; // value added after scaling
38
+ uchar qs[16]; // two 4-bit values per byte
39
+ };
40
+ // Each work-item expands one byte of qs into two floats computed as nibble*d + m.
41
+ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
42
+ const uint i = get_global_id(0) / 32; // block index
43
+ const uint l = get_local_id(0); // byte index inside the block
44
+
45
+ const float d = blocks[i].d;
46
+ const float m = blocks[i].m;
47
+
48
+ const uchar vi = blocks[i].qs[l];
49
+
50
+ const uint index = i*32 + l*2;
51
+ result[index + 0] = (vi & 0xf) * d + m; // low nibble
52
+ result[index + 1] = (vi >> 4) * d + m; // high nibble
53
+ }
54
+
55
+ struct block_q4_2 // Q4_2 block: 16 values per block.
56
+ {
57
+ ushort d; // multiplier stored as raw half-precision bits (read with vload_half below)
58
+ uchar qs[8]; // two 4-bit values per byte
59
+ };
60
+ // Same formula as Q4_0, but with 8 bytes (16 values) per block and a half-precision multiplier.
61
+ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
62
+ const uint i = get_global_id(0) / 16; // block index
63
+ const uint l = get_local_id(0); // byte index inside the block
64
+
65
+ const float d = vload_half(0, (__global half*) &blocks[i].d); // reinterpret the ushort as half and widen to float
66
+
67
+ const uchar vi = blocks[i].qs[l];
68
+
69
+ const uint index = i*16 + l*2;
70
+ result[index + 0] = ((vi & 0xf) - 8)*d;
71
+ result[index + 1] = ((vi >> 4) - 8)*d;
72
+ }
73
+
74
+ // Device-side Q5_0 layout; the host fills it through cl_block_q5_0 in ggml_cl_sgemm_wrapper().
75
+ struct block_q5_0
76
+ {
77
+ float d; // multiplier
78
+ uint qh; // one extra (fifth) bit per value, indexed by output position inside the block
79
+ uchar qs[16]; // low four bits of each value, two per byte
80
+ };
81
+ // Each work-item rebuilds two 5-bit values from a nibble plus one bit of qh.
82
+ __kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
83
+ const uint i = get_global_id(0) / 32; // block index
84
+ const uint l = get_local_id(0); // byte index inside the block
85
+
86
+ const float d = blocks[i].d;
87
+
88
+ const uchar vi = blocks[i].qs[l];
89
+
90
+ const uint l2 = l * 2; // position of the first output value inside the block
91
+
92
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; // bit l2 of qh moved to bit 4
93
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; // bit l2+1 of qh moved to bit 4
94
+
95
+ const uint index = i*32 + l2;
96
+ result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; // 5-bit value, shifted by -16, then scaled
97
+ result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
98
+ }
99
+
100
+ struct block_q5_1 // Q5_1 block: 5-bit values with multiplier and additive term.
101
+ {
102
+ ushort d; // multiplier as raw half-precision bits
103
+ ushort m; // additive term as raw half-precision bits
104
+ uint qh; // one extra (fifth) bit per value
105
+ uchar qs[16]; // low four bits of each value, two per byte
106
+ };
107
+ // Each work-item rebuilds two 5-bit values and computes value*d + m.
108
+ __kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
109
+ const uint i = get_global_id(0) / 32; // block index
110
+ const uint l = get_local_id(0); // byte index inside the block
111
+
112
+ const float d = vload_half(0, (__global half*) &blocks[i].d);
113
+ const float m = vload_half(0, (__global half*) &blocks[i].m);
114
+
115
+ const uchar vi = blocks[i].qs[l];
116
+
117
+ const uint l2 = l * 2; // position of the first output value inside the block
118
+
119
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; // bit l2 of qh moved to bit 4
120
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; // bit l2+1 of qh moved to bit 4
121
+
122
+ const uint index = i*32 + l2;
123
+ result[index + 0] = ((vi & 0xf) | vh0)*d + m;
124
+ result[index + 1] = ((vi >> 4) | vh1)*d + m;
125
+ }
126
+
127
+ struct block_q8_0 // Q8_0 block: 32 signed bytes sharing one multiplier.
128
+ {
129
+ float d; // multiplier
130
+ char qs[32]; // one 8-bit value per output element
131
+ };
132
+ // One work-item per output element: result = qs[l] * d.
133
+ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
134
+ const uint i = get_global_id(0) / 32; // block index
135
+ const uint l = get_local_id(0); // element index inside the block
136
+
137
+ result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
138
+ }
139
+
140
+ );
141
+
142
/*
 * CL_CHECK(err, name): evaluate the OpenCL status expression `err` once; if it
 * is anything other than CL_SUCCESS, report it on stderr (tagged with `name`
 * and the call site) and terminate the process with exit status 1.
 * Wrapped in do/while(0) so it behaves like a single statement.
 */
#define CL_CHECK(err, name)                                                                       \
    do {                                                                                          \
        const cl_int cl_status_ = (err);                                                          \
        if (cl_status_ == CL_SUCCESS) {                                                           \
            break;                                                                                \
        }                                                                                         \
        fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, cl_status_, __FILE__, __LINE__);   \
        exit(1);                                                                                  \
    } while (0)
150
+
151
+ #define QK5_0 32
152
+ typedef struct {
153
+ ggml_fp16_t d; // delta
154
+ uint8_t qh[4]; // 5-th bit of quants
155
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
156
+ } block_q5_0;
157
+
158
+
159
+ typedef struct {
160
+ float d; // delta
161
+ uint32_t qh; // 5-th bit of quants
162
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
163
+ } cl_block_q5_0;
164
+
165
+ static cl_platform_id platform;
166
+ static cl_device_id device;
167
+ static cl_context context;
168
+ static cl_command_queue queue;
169
+ static cl_program program;
170
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
171
+ static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
172
+ static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
173
+
174
+ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
175
+ cl_program p;
176
+ char *program_log;
177
+ size_t program_size, log_size;
178
+ int err;
179
+
180
+ program_size = strlen(program_buffer);
181
+
182
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
183
+ if(err < 0) {
184
+ fprintf(stderr, "OpenCL error creating program");
185
+ exit(1);
186
+ }
187
+
188
+ err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
189
+ if(err < 0) {
190
+
191
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
192
+ program_log = (char*) malloc(log_size + 1);
193
+ program_log[log_size] = '\0';
194
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
195
+ printf("%s\n", program_log);
196
+ free(program_log);
197
+ exit(1);
198
+ }
199
+
200
+ return p;
201
+ }
202
+
203
+ void ggml_cl_init(void) {
204
+ cl_int err = 0;
205
+ char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
206
+ char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
207
+ int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
208
+ int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
209
+ printf("\nInitializing CLBlast (First Run)...");
210
+ printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
211
+ cl_uint num_platforms;
212
+ clGetPlatformIDs(0, NULL, &num_platforms);
213
+ cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
214
+ clGetPlatformIDs(num_platforms, platforms, NULL);
215
+ platform = platforms[plat_num];
216
+ char platform_buffer[1024];
217
+ clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
218
+ cl_uint num_devices;
219
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
220
+ cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
221
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
222
+ device = devices[dev_num];
223
+ char device_buffer[1024];
224
+ clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
225
+ printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
226
+ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
227
+ CL_CHECK(err, "clCreateContext");
228
+ queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
229
+ CL_CHECK(err, "clCreateCommandQueue");
230
+
231
+ free(platforms);
232
+ free(devices);
233
+
234
+ program = build_program_from_source(context, device, clblast_dequant);
235
+
236
+ // Prepare dequantize kernels
237
+ kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
238
+ CL_CHECK(err, "clCreateKernel");
239
+ kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
240
+ CL_CHECK(err, "clCreateKernel");
241
+ kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
242
+ CL_CHECK(err, "clCreateKernel");
243
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244
+ CL_CHECK(err, "clCreateKernel");
245
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
246
+ CL_CHECK(err, "clCreateKernel");
247
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
248
+ CL_CHECK(err, "clCreateKernel");
249
+ }
250
+
251
+ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
252
+ if (req_size <= *cur_size) {
253
+ return;
254
+ }
255
+
256
+ // Reallocate buffer with enough space
257
+ if (*cur_size > 0) {
258
+ clReleaseMemObject(*buf);
259
+ }
260
+ cl_int err;
261
+ *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
262
+ *cur_size = req_size;
263
+ CL_CHECK(err, "clCreateBuffer");
264
+ }
265
+
266
+ void ggml_cl_sgemm_wrapper(
267
+ const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
268
+ const int m, const int n, const int k,
269
+ const float alpha, const void *host_a, const int lda,
270
+ const float *host_b, const int ldb, const float beta,
271
+ float *host_c, const int ldc, const int btype) {
272
+ cl_int err = 0;
273
+
274
+ cl_kernel kernel;
275
+ size_t global = n * k, local, size_qb;
276
+ bool dequant;
277
+ cl_block_q5_0* cl_host_b;
278
+
279
+ switch (btype) {
280
+ case GGML_TYPE_F32:
281
+ dequant = false;
282
+ break;
283
+ case GGML_TYPE_Q4_0:
284
+ dequant = true;
285
+ kernel = kernel_q4_0;
286
+ local = 16;
287
+ size_qb = global * (sizeof(float) + local) / 32;
288
+ break;
289
+ case GGML_TYPE_Q4_1:
290
+ dequant = true;
291
+ kernel = kernel_q4_1;
292
+ local = 16;
293
+ size_qb = global * (sizeof(float) * 2 + local) / 32;
294
+ break;
295
+ case GGML_TYPE_Q4_2:
296
+ dequant = true;
297
+ kernel = kernel_q4_2;
298
+ local = 8;
299
+ size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
300
+ break;
301
+ case GGML_TYPE_Q5_0:
302
+ dequant = true;
303
+ kernel = kernel_q5_0;
304
+ local = 16;
305
+ // For some reason OpenCL seems to be incapable of working with structs of size 22.
306
+ // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
307
+ // TODO Find the reason, fix and remove workaround.
308
+ const block_q5_0* b = (const block_q5_0*) host_b;
309
+ cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
310
+ for (size_t i = 0; i < global / 32; i++) {
311
+ cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
312
+ memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
313
+ memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
314
+ }
315
+ host_b = (const float*) cl_host_b;
316
+ size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
317
+ break;
318
+ case GGML_TYPE_Q5_1:
319
+ dequant = true;
320
+ kernel = kernel_q5_1;
321
+ local = 16;
322
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
323
+ break;
324
+ case GGML_TYPE_Q8_0:
325
+ dequant = true;
326
+ kernel = kernel_q8_0;
327
+ local = 32;
328
+ size_qb = global * (sizeof(float) + local) / 32;
329
+ break;
330
+ default:
331
+ fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
332
+ abort();
333
+ }
334
+
335
+ const size_t size_a = m * k * sizeof(float);
336
+ const size_t size_b = n * k * sizeof(float);
337
+ const size_t size_c = m * n * sizeof(float);
338
+
339
+ // Prepare buffers
340
+ ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
341
+ if (dequant) {
342
+ ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
343
+ }
344
+ ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
345
+ ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
346
+
347
+ cl_event ev_a, ev_qb, ev_b;
348
+
349
+ if (dequant) {
350
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
351
+ err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
352
+ CL_CHECK(err, "clSetKernelArg");
353
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
354
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
355
+ } else {
356
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
357
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
358
+ }
359
+
360
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
361
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
362
+ if (dequant) {
363
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
364
+ CL_CHECK(err, "clEnqueueNDRangeKernel");
365
+ clReleaseEvent(ev_qb);
366
+ }
367
+ clWaitForEvents(1, &ev_a);
368
+ clWaitForEvents(1, &ev_b);
369
+ clReleaseEvent(ev_a);
370
+ clReleaseEvent(ev_b);
371
+
372
+ cl_event ev_sgemm;
373
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
374
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
375
+ m, n, k,
376
+ alpha,
377
+ cl_buffer_a, 0, lda,
378
+ cl_buffer_b, 0, ldb,
379
+ beta,
380
+ cl_buffer_c, 0, ldc,
381
+ &queue, &ev_sgemm);
382
+
383
+ if (status != CLBlastSuccess) {
384
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
385
+ abort();
386
+ }
387
+
388
+ cl_event ev_c;
389
+ clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
390
+
391
+ // Wait for completion
392
+ clWaitForEvents(1, &ev_c);
393
+ clReleaseEvent(ev_sgemm);
394
+ clReleaseEvent(ev_c);
395
+ if (btype == GGML_TYPE_Q5_0) {
396
+ free((void*) cl_host_b);
397
+ }
398
+ }
@@ -0,0 +1,24 @@
1
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

/* Set up the OpenCL backend. Takes no arguments and returns nothing. */
void ggml_cl_init(void);

/* Matrix storage order accepted by ggml_cl_sgemm_wrapper(). */
enum ggml_blas_order {
    GGML_BLAS_ORDER_ROW_MAJOR    = 101,
    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
};

/*
 * Per-operand operation selector accepted by ggml_cl_sgemm_wrapper().
 * NOTE(review): presumably N = no transpose, T = transpose,
 * C = conjugate transpose - confirm against the BLAS backend in use.
 */
enum ggml_blas_op {
    GGML_BLAS_OP_N = 111,
    GGML_BLAS_OP_T = 112,
    GGML_BLAS_OP_C = 113,
};

/*
 * SGEMM entry point of the OpenCL backend.
 *
 * order, trans_a, trans_b - layout and per-operand operation
 * m, n, k                 - matrix dimensions
 * alpha, beta             - scalar factors
 * host_a / lda            - host data and leading dimension of A
 * host_b / ldb            - host data and leading dimension of B
 * host_c / ldc            - host output and leading dimension of C
 * btype                   - integer type tag describing the data behind host_b
 */
void ggml_cl_sgemm_wrapper(
    const enum ggml_blas_order order,
    const enum ggml_blas_op trans_a,
    const enum ggml_blas_op trans_b,
    const int m, const int n, const int k,
    const float alpha,
    const void *host_a, const int lda,
    const float *host_b, const int ldb,
    const float beta,
    float *host_c, const int ldc,
    const int btype);

#ifdef __cplusplus
}
#endif