llama_cpp 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,361 +0,0 @@
1
- #include "ggml-opencl.h"
2
-
3
- #define CL_TARGET_OPENCL_VERSION 110
4
- #include <clblast_c.h>
5
-
6
- #include <stdlib.h>
7
- #include <stdio.h>
8
- #include <string.h>
9
-
10
- #include "ggml.h"
11
-
// Turns its arguments into a single string literal. Used to embed the OpenCL C
// program below directly in this file; whitespace runs (including newlines)
// between tokens collapse to one space, and comments are dropped by the
// preprocessor before stringification.
#define MULTILINE_QUOTE(...) #__VA_ARGS__

// OpenCL C source for the dequantization kernels. It is compiled at runtime by
// ggml_cl_init() and provides one kernel per supported quantized block format.
// Each kernel reads an array of quantized blocks `x` and writes the expanded
// float values to `y`.
const char * clblast_dequant = MULTILINE_QUOTE(

// Fixed-width aliases so the struct definitions read like the host-side ones.
typedef uchar uint8_t;
typedef char int8_t;
typedef int int32_t;
typedef uint uint32_t;

// Q4_0: 32 values per block, one float scale, two 4-bit quants per byte.
constant uint QK4_0 = 32;
struct block_q4_0
{
    float d;
    uint8_t qs[QK4_0 / 2];
};

// Q4_1: like Q4_0 plus a float offset `m` added after scaling.
constant uint QK4_1 = 32;
struct block_q4_1
{
    float d;
    float m;
    uint8_t qs[QK4_1 / 2];
};

// Q5_0: half-precision scale, 32 high bits in `qh`, low nibbles in `qs`.
// Packed so that `qh` directly follows the 2-byte `d` without padding.
constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
{
    half d;
    uint32_t qh;
    uint8_t qs[QK5_0 / 2];
};

// Q5_1: like Q5_0 plus a half-precision offset `m`.
constant uint QK5_1 = 32;
struct block_q5_1
{
    half d;
    half m;
    uint32_t qh;
    uint8_t qs[QK5_1 / 2];
};

// Q8_0: one float scale and 32 byte-sized quants. The quants are signed
// (ggml stores them as int8_t), so they must not be read as uchar: doing so
// would turn every negative quant into a value in 128..255.
constant uint QK8_0 = 32;
struct block_q8_0
{
    float d;
    int8_t qs[QK8_0];
};


// Expands Q4_0 blocks. The block index comes from the global id and the byte
// index from the local id; the host launches this with a work-group size of 16
// so that the local id covers qs[0..15]. Each work-item emits two floats: the
// low nibble goes to the first half of the block, the high nibble to the second.
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
    constant uint qk = QK4_0;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;

    // Quants are stored with a bias of 8.
    const int x0 = (x[i].qs[j] & 0xf) - 8;
    const int x1 = (x[i].qs[j] >> 4) - 8;

    y[i*qk + j + 0   ] = x0*d;
    y[i*qk + j + qk/2] = x1*d;
}

// Expands Q4_1 blocks: same indexing as Q4_0, but the nibbles are unbiased and
// the per-block offset `m` is added after scaling.
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
    constant uint qk = QK4_1;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;
    const float m = x[i].m;

    const int x0 = (x[i].qs[j] & 0xf);
    const int x1 = (x[i].qs[j] >> 4);

    y[i*qk + j + 0   ] = x0*d + m;
    y[i*qk + j + qk/2] = x1*d + m;
}

// Expands Q5_0 blocks. Each 5-bit quant is a low nibble from `qs` combined with
// one bit from `qh` (bit j for the first half, bit j+16 for the second half),
// then re-centred by subtracting 16.
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
    constant uint qk = QK5_0;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    // The scale is stored as half; vload_half converts it to float.
    const float d = vload_half(0, (__global half*) &x[i].d);

    uint32_t qh = x[i].qh;

    // Move the selected high bit into bit position 4 (value 0x10).
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;

    y[i*qk + j + 0   ] = x0*d;
    y[i*qk + j + qk/2] = x1*d;
}

// Expands Q5_1 blocks: same bit assembly as Q5_0, but the quants are unbiased
// and the per-block offset `m` is added after scaling.
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
    constant uint qk = QK5_1;

    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = vload_half(0, (__global half*) &x[i].d);
    const float m = vload_half(0, (__global half*) &x[i].m);

    uint32_t qh = x[i].qh;

    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;

    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
    const int x1 = (x[i].qs[j] >>  4) | xh_1;

    y[i*qk + j + 0   ] = x0*d + m;
    y[i*qk + j + qk/2] = x1*d + m;
}

// Expands Q8_0 blocks: one work-item per value (the host uses a work-group size
// of 32), each multiplying its signed quant by the block scale.
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
    constant uint qk = QK8_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);

    const float d = x[i].d;
    y[i*qk + j] = x[i].qs[j]*d;
}

);
141
-
// Terminates the process when an OpenCL call did not succeed.
//   err  - status expression; evaluated exactly once
//   name - label for the failing call, printed in the diagnostic
// On failure a message with the status code and source location is written to
// stderr and the process exits with status 1.
#define CL_CHECK(err, name)                                              \
    do {                                                                 \
        const cl_int cl_check_status_ = (err);                           \
        if (cl_check_status_ == CL_SUCCESS) {                            \
            break;                                                       \
        }                                                                \
        fprintf(stderr, "OpenCL %s error %d at %s:%d\n",                 \
                name, cl_check_status_, __FILE__, __LINE__);             \
        exit(1);                                                         \
    } while (0)
150
-
151
- static cl_platform_id platform;
152
- static cl_device_id device;
153
- static cl_context context;
154
- static cl_command_queue queue;
155
- static cl_program program;
156
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
157
- static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
158
- static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
159
-
160
- static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
161
- cl_program p;
162
- char *program_log;
163
- size_t program_size, log_size;
164
- int err;
165
-
166
- program_size = strlen(program_buffer);
167
-
168
- p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
169
- if(err < 0) {
170
- fprintf(stderr, "OpenCL error creating program");
171
- exit(1);
172
- }
173
-
174
- err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
175
- if(err < 0) {
176
-
177
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
178
- program_log = (char*) malloc(log_size + 1);
179
- program_log[log_size] = '\0';
180
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
181
- printf("%s\n", program_log);
182
- free(program_log);
183
- exit(1);
184
- }
185
-
186
- return p;
187
- }
188
-
189
- void ggml_cl_init(void) {
190
- cl_int err = 0;
191
- char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
192
- char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
193
- int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
194
- int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
195
- printf("\nInitializing CLBlast (First Run)...");
196
- printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
197
- cl_uint num_platforms;
198
- clGetPlatformIDs(0, NULL, &num_platforms);
199
- cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
200
- clGetPlatformIDs(num_platforms, platforms, NULL);
201
- platform = platforms[plat_num];
202
- char platform_buffer[1024];
203
- clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
204
- cl_uint num_devices;
205
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
206
- cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
207
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
208
- device = devices[dev_num];
209
- char device_buffer[1024];
210
- clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
211
- printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
212
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
213
- CL_CHECK(err, "clCreateContext");
214
- queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
215
- CL_CHECK(err, "clCreateCommandQueue");
216
-
217
- free(platforms);
218
- free(devices);
219
-
220
- program = build_program_from_source(context, device, clblast_dequant);
221
-
222
- // Prepare dequantize kernels
223
- kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
224
- CL_CHECK(err, "clCreateKernel");
225
- kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
226
- CL_CHECK(err, "clCreateKernel");
227
- kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
228
- CL_CHECK(err, "clCreateKernel");
229
- kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
230
- CL_CHECK(err, "clCreateKernel");
231
- kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
232
- CL_CHECK(err, "clCreateKernel");
233
- }
234
-
235
- static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
236
- if (req_size <= *cur_size) {
237
- return;
238
- }
239
-
240
- // Reallocate buffer with enough space
241
- if (*cur_size > 0) {
242
- clReleaseMemObject(*buf);
243
- }
244
- cl_int err;
245
- *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
246
- *cur_size = req_size;
247
- CL_CHECK(err, "clCreateBuffer");
248
- }
249
-
250
- void ggml_cl_sgemm_wrapper(
251
- const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
252
- const int m, const int n, const int k,
253
- const float alpha, const void *host_a, const int lda,
254
- const float *host_b, const int ldb, const float beta,
255
- float *host_c, const int ldc, const int btype) {
256
- cl_int err = 0;
257
-
258
- cl_kernel kernel;
259
- size_t global = n * k, local, size_qb;
260
- bool dequant;
261
-
262
- switch (btype) {
263
- case GGML_TYPE_F32:
264
- dequant = false;
265
- break;
266
- case GGML_TYPE_Q4_0:
267
- dequant = true;
268
- kernel = kernel_q4_0;
269
- local = 16;
270
- size_qb = global * (sizeof(float) + local) / 32;
271
- break;
272
- case GGML_TYPE_Q4_1:
273
- dequant = true;
274
- kernel = kernel_q4_1;
275
- local = 16;
276
- size_qb = global * (sizeof(float) * 2 + local) / 32;
277
- break;
278
- case GGML_TYPE_Q5_0:
279
- dequant = true;
280
- kernel = kernel_q5_0;
281
- local = 16;
282
- size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
283
- break;
284
- case GGML_TYPE_Q5_1:
285
- dequant = true;
286
- kernel = kernel_q5_1;
287
- local = 16;
288
- size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
289
- break;
290
- case GGML_TYPE_Q8_0:
291
- dequant = true;
292
- kernel = kernel_q8_0;
293
- local = 32;
294
- size_qb = global * (sizeof(float) + local) / 32;
295
- break;
296
- default:
297
- fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
298
- abort();
299
- }
300
-
301
- const size_t size_a = m * k * sizeof(float);
302
- const size_t size_b = n * k * sizeof(float);
303
- const size_t size_c = m * n * sizeof(float);
304
-
305
- // Prepare buffers
306
- ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
307
- if (dequant) {
308
- ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
309
- }
310
- ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
311
- ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
312
-
313
- cl_event ev_a, ev_qb, ev_b;
314
-
315
- if (dequant) {
316
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
317
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
318
- CL_CHECK(err, "clSetKernelArg");
319
- err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
320
- CL_CHECK(err, "clEnqueueWriteBuffer qb");
321
- } else {
322
- err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
323
- CL_CHECK(err, "clEnqueueWriteBuffer b");
324
- }
325
-
326
- err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
327
- CL_CHECK(err, "clEnqueueWriteBuffer a");
328
- if (dequant) {
329
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
330
- CL_CHECK(err, "clEnqueueNDRangeKernel");
331
- clReleaseEvent(ev_qb);
332
- }
333
- clWaitForEvents(1, &ev_a);
334
- clWaitForEvents(1, &ev_b);
335
- clReleaseEvent(ev_a);
336
- clReleaseEvent(ev_b);
337
-
338
- cl_event ev_sgemm;
339
- CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
340
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
341
- m, n, k,
342
- alpha,
343
- cl_buffer_a, 0, lda,
344
- cl_buffer_b, 0, ldb,
345
- beta,
346
- cl_buffer_c, 0, ldc,
347
- &queue, &ev_sgemm);
348
-
349
- if (status != CLBlastSuccess) {
350
- fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
351
- abort();
352
- }
353
-
354
- cl_event ev_c;
355
- clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
356
-
357
- // Wait for completion
358
- clWaitForEvents(1, &ev_c);
359
- clReleaseEvent(ev_sgemm);
360
- clReleaseEvent(ev_c);
361
- }