llama_cpp 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,18 @@
1
+ #include "ggml.h"
2
+
1
3
  #ifdef __cplusplus
2
4
  extern "C" {
3
5
  #endif
4
6
 
5
- void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
6
- void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
7
- void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
8
- void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
7
+ void ggml_init_cublas(void);
8
+
9
+ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
+ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
11
+ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
12
+
13
+ // TODO: export these with GGML_API
14
+ void * ggml_cuda_host_malloc(size_t size);
15
+ void ggml_cuda_host_free(void * ptr);
9
16
 
10
17
  #ifdef __cplusplus
11
18
  }
@@ -0,0 +1,398 @@
1
+ #include "ggml-opencl.h"
2
+
3
+ #define CL_TARGET_OPENCL_VERSION 110
4
+ #include <clblast_c.h>
5
+
6
+ #include <stdlib.h>
7
+ #include <stdio.h>
8
+ #include <string.h>
9
+
10
+ #include "ggml.h"
11
+
12
// Stringifies its arguments, so the OpenCL program below can be written as plain
// code instead of a hand-quoted string literal.
#define MULTILINE_QUOTE(...) #__VA_ARGS__
// OpenCL C source of the dequantization kernels. ggml_cl_init() passes this string to
// build_program_from_source(), so everything inside the macro call is compiled at
// runtime by the OpenCL driver, not by the host compiler.
// Comments inside the macro call are removed by the preprocessor before
// stringification and do not end up in the program text.
const char * clblast_dequant = MULTILINE_QUOTE(

// Device-side Q4_0 block: one float scale followed by 16 bytes of packed 4-bit quants.
struct block_q4_0
{
    float d;
    uchar qs[16];
};

// Expands Q4_0 blocks to floats. Each work-item reads one byte of qs and writes two
// consecutive outputs. The block index is derived from the global id (32 ids per
// block) and the byte index from the local id; the host code in this file launches
// this kernel with a work-group size of 16.
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
    const uint i = get_global_id(0) / 32;
    const uint l = get_local_id(0);

    const float d = blocks[i].d;

    const uchar vi = blocks[i].qs[l];

    // low nibble -> first output, high nibble -> second output, both offset by -8
    const uint index = i*32 + l*2;
    result[index + 0] = ((vi & 0xf) - 8)*d;
    result[index + 1] = ((vi >> 4) - 8)*d;
}

// Device-side Q4_1 block: float scale d, float offset m, 16 bytes of packed 4-bit quants.
struct block_q4_1
{
    float d;
    float m;
    uchar qs[16];
};

// Expands Q4_1 blocks to floats as nibble * d + m, two outputs per work-item.
// Indexing is the same as in dequantize_row_q4_0.
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
    const uint i = get_global_id(0) / 32;
    const uint l = get_local_id(0);

    const float d = blocks[i].d;
    const float m = blocks[i].m;

    const uchar vi = blocks[i].qs[l];

    const uint index = i*32 + l*2;
    result[index + 0] = (vi & 0xf) * d + m;
    result[index + 1] = (vi >> 4) * d + m;
}

// Device-side Q4_2 block: scale stored as raw half-precision bits in a ushort,
// followed by 8 bytes of packed 4-bit quants (16 values per block).
struct block_q4_2
{
    ushort d;
    uchar qs[8];
};

// Expands Q4_2 blocks to floats. Blocks hold 16 values, so the block index is the
// global id divided by 16; the host code in this file uses a work-group size of 8.
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
    const uint i = get_global_id(0) / 16;
    const uint l = get_local_id(0);

    // reinterpret the stored ushort as a half and convert it to float
    const float d = vload_half(0, (__global half*) &blocks[i].d);

    const uchar vi = blocks[i].qs[l];

    const uint index = i*16 + l*2;
    result[index + 0] = ((vi & 0xf) - 8)*d;
    result[index + 1] = ((vi >> 4) - 8)*d;
}


// Device-side Q5_0 block. The scale is a float here: ggml_cl_sgemm_wrapper() converts
// the host blocks (which carry a half-precision scale) into this layout before upload.
struct block_q5_0
{
    float d;
    uint qh;
    uchar qs[16];
};

// Expands Q5_0 blocks to floats. The low four bits of every quant come from qs and the
// fifth bit from qh; the combined 5-bit value is offset by -16 and scaled by d.
__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
    const uint i = get_global_id(0) / 32;
    const uint l = get_local_id(0);

    const float d = blocks[i].d;

    const uchar vi = blocks[i].qs[l];

    const uint l2 = l * 2;

    // isolate bit l2 (resp. l2 + 1) of qh and move it to bit position 4
    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;

    const uint index = i*32 + l2;
    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
    result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
}

// Device-side Q5_1 block: scale d and offset m as raw half-precision bits, the fifth
// bits of all quants in qh, and 16 bytes of packed low nibbles.
struct block_q5_1
{
    ushort d;
    ushort m;
    uint qh;
    uchar qs[16];
};

// Expands Q5_1 blocks to floats as (5-bit quant) * d + m, two outputs per work-item.
__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
    const uint i = get_global_id(0) / 32;
    const uint l = get_local_id(0);

    const float d = vload_half(0, (__global half*) &blocks[i].d);
    const float m = vload_half(0, (__global half*) &blocks[i].m);

    const uchar vi = blocks[i].qs[l];

    const uint l2 = l * 2;

    // isolate bit l2 (resp. l2 + 1) of qh and move it to bit position 4
    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;

    const uint index = i*32 + l2;
    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
    result[index + 1] = ((vi >> 4) | vh1)*d + m;
}

// Device-side Q8_0 block: float scale followed by 32 signed 8-bit quants.
struct block_q8_0
{
    float d;
    char qs[32];
};

// Expands Q8_0 blocks to floats, one output per work-item; the host code in this file
// uses a work-group size of 32 so the local id addresses every quant of a block.
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
    const uint i = get_global_id(0) / 32;
    const uint l = get_local_id(0);

    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
}

);
141
+
142
// Evaluates `err` exactly once; if it is anything other than CL_SUCCESS, reports the
// failing operation `name` together with the source location on stderr and terminates
// the process with exit status 1.
#define CL_CHECK(err, name)                                                   \
    do {                                                                      \
        const cl_int cl_status_ = (err);                                      \
        if (cl_status_ == CL_SUCCESS) {                                       \
            break;                                                            \
        }                                                                     \
        fprintf(stderr, "OpenCL %s error %d at %s:%d\n",                      \
                name, cl_status_, __FILE__, __LINE__);                        \
        exit(1);                                                              \
    } while (0)
150
+
151
+ #define QK5_0 32
152
+ typedef struct {
153
+ ggml_fp16_t d; // delta
154
+ uint8_t qh[4]; // 5-th bit of quants
155
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
156
+ } block_q5_0;
157
+
158
+
159
+ typedef struct {
160
+ float d; // delta
161
+ uint32_t qh; // 5-th bit of quants
162
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
163
+ } cl_block_q5_0;
164
+
165
+ static cl_platform_id platform;
166
+ static cl_device_id device;
167
+ static cl_context context;
168
+ static cl_command_queue queue;
169
+ static cl_program program;
170
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
171
+ static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
172
+ static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
173
+
174
+ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
175
+ cl_program p;
176
+ char *program_log;
177
+ size_t program_size, log_size;
178
+ int err;
179
+
180
+ program_size = strlen(program_buffer);
181
+
182
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
183
+ if(err < 0) {
184
+ fprintf(stderr, "OpenCL error creating program");
185
+ exit(1);
186
+ }
187
+
188
+ err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
189
+ if(err < 0) {
190
+
191
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
192
+ program_log = (char*) malloc(log_size + 1);
193
+ program_log[log_size] = '\0';
194
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
195
+ printf("%s\n", program_log);
196
+ free(program_log);
197
+ exit(1);
198
+ }
199
+
200
+ return p;
201
+ }
202
+
203
+ void ggml_cl_init(void) {
204
+ cl_int err = 0;
205
+ char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
206
+ char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
207
+ int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
208
+ int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
209
+ printf("\nInitializing CLBlast (First Run)...");
210
+ printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
211
+ cl_uint num_platforms;
212
+ clGetPlatformIDs(0, NULL, &num_platforms);
213
+ cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
214
+ clGetPlatformIDs(num_platforms, platforms, NULL);
215
+ platform = platforms[plat_num];
216
+ char platform_buffer[1024];
217
+ clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
218
+ cl_uint num_devices;
219
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
220
+ cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
221
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
222
+ device = devices[dev_num];
223
+ char device_buffer[1024];
224
+ clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
225
+ printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
226
+ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
227
+ CL_CHECK(err, "clCreateContext");
228
+ queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
229
+ CL_CHECK(err, "clCreateCommandQueue");
230
+
231
+ free(platforms);
232
+ free(devices);
233
+
234
+ program = build_program_from_source(context, device, clblast_dequant);
235
+
236
+ // Prepare dequantize kernels
237
+ kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
238
+ CL_CHECK(err, "clCreateKernel");
239
+ kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
240
+ CL_CHECK(err, "clCreateKernel");
241
+ kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
242
+ CL_CHECK(err, "clCreateKernel");
243
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244
+ CL_CHECK(err, "clCreateKernel");
245
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
246
+ CL_CHECK(err, "clCreateKernel");
247
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
248
+ CL_CHECK(err, "clCreateKernel");
249
+ }
250
+
251
+ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
252
+ if (req_size <= *cur_size) {
253
+ return;
254
+ }
255
+
256
+ // Reallocate buffer with enough space
257
+ if (*cur_size > 0) {
258
+ clReleaseMemObject(*buf);
259
+ }
260
+ cl_int err;
261
+ *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
262
+ *cur_size = req_size;
263
+ CL_CHECK(err, "clCreateBuffer");
264
+ }
265
+
266
+ void ggml_cl_sgemm_wrapper(
267
+ const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
268
+ const int m, const int n, const int k,
269
+ const float alpha, const void *host_a, const int lda,
270
+ const float *host_b, const int ldb, const float beta,
271
+ float *host_c, const int ldc, const int btype) {
272
+ cl_int err = 0;
273
+
274
+ cl_kernel kernel;
275
+ size_t global = n * k, local, size_qb;
276
+ bool dequant;
277
+ cl_block_q5_0* cl_host_b;
278
+
279
+ switch (btype) {
280
+ case GGML_TYPE_F32:
281
+ dequant = false;
282
+ break;
283
+ case GGML_TYPE_Q4_0:
284
+ dequant = true;
285
+ kernel = kernel_q4_0;
286
+ local = 16;
287
+ size_qb = global * (sizeof(float) + local) / 32;
288
+ break;
289
+ case GGML_TYPE_Q4_1:
290
+ dequant = true;
291
+ kernel = kernel_q4_1;
292
+ local = 16;
293
+ size_qb = global * (sizeof(float) * 2 + local) / 32;
294
+ break;
295
+ case GGML_TYPE_Q4_2:
296
+ dequant = true;
297
+ kernel = kernel_q4_2;
298
+ local = 8;
299
+ size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
300
+ break;
301
+ case GGML_TYPE_Q5_0:
302
+ dequant = true;
303
+ kernel = kernel_q5_0;
304
+ local = 16;
305
+ // For some reason OpenCL seems to be incapable of working with structs of size 22.
306
+ // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
307
+ // TODO Find the reason, fix and remove workaround.
308
+ const block_q5_0* b = (const block_q5_0*) host_b;
309
+ cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
310
+ for (size_t i = 0; i < global / 32; i++) {
311
+ cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
312
+ memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
313
+ memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
314
+ }
315
+ host_b = (const float*) cl_host_b;
316
+ size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
317
+ break;
318
+ case GGML_TYPE_Q5_1:
319
+ dequant = true;
320
+ kernel = kernel_q5_1;
321
+ local = 16;
322
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
323
+ break;
324
+ case GGML_TYPE_Q8_0:
325
+ dequant = true;
326
+ kernel = kernel_q8_0;
327
+ local = 32;
328
+ size_qb = global * (sizeof(float) + local) / 32;
329
+ break;
330
+ default:
331
+ fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
332
+ abort();
333
+ }
334
+
335
+ const size_t size_a = m * k * sizeof(float);
336
+ const size_t size_b = n * k * sizeof(float);
337
+ const size_t size_c = m * n * sizeof(float);
338
+
339
+ // Prepare buffers
340
+ ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
341
+ if (dequant) {
342
+ ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
343
+ }
344
+ ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
345
+ ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
346
+
347
+ cl_event ev_a, ev_qb, ev_b;
348
+
349
+ if (dequant) {
350
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
351
+ err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
352
+ CL_CHECK(err, "clSetKernelArg");
353
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
354
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
355
+ } else {
356
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
357
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
358
+ }
359
+
360
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
361
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
362
+ if (dequant) {
363
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
364
+ CL_CHECK(err, "clEnqueueNDRangeKernel");
365
+ clReleaseEvent(ev_qb);
366
+ }
367
+ clWaitForEvents(1, &ev_a);
368
+ clWaitForEvents(1, &ev_b);
369
+ clReleaseEvent(ev_a);
370
+ clReleaseEvent(ev_b);
371
+
372
+ cl_event ev_sgemm;
373
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
374
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
375
+ m, n, k,
376
+ alpha,
377
+ cl_buffer_a, 0, lda,
378
+ cl_buffer_b, 0, ldb,
379
+ beta,
380
+ cl_buffer_c, 0, ldc,
381
+ &queue, &ev_sgemm);
382
+
383
+ if (status != CLBlastSuccess) {
384
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
385
+ abort();
386
+ }
387
+
388
+ cl_event ev_c;
389
+ clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
390
+
391
+ // Wait for completion
392
+ clWaitForEvents(1, &ev_c);
393
+ clReleaseEvent(ev_sgemm);
394
+ clReleaseEvent(ev_c);
395
+ if (btype == GGML_TYPE_Q5_0) {
396
+ free((void*) cl_host_b);
397
+ }
398
+ }
@@ -0,0 +1,24 @@
1
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

// Sets up the global OpenCL state; call before ggml_cl_sgemm_wrapper().
void ggml_cl_init(void);

// Storage order of the matrices handed to ggml_cl_sgemm_wrapper().
enum ggml_blas_order {
    GGML_BLAS_ORDER_ROW_MAJOR    = 101,
    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
};

// Operation applied to an input matrix of ggml_cl_sgemm_wrapper().
enum ggml_blas_op {
    GGML_BLAS_OP_N = 111,
    GGML_BLAS_OP_T = 112,
    GGML_BLAS_OP_C = 113,
};

// SGEMM entry point of the OpenCL backend. `btype` is the ggml type of `host_b`.
void ggml_cl_sgemm_wrapper(
    const enum ggml_blas_order order,
    const enum ggml_blas_op trans_a,
    const enum ggml_blas_op trans_b,
    const int m,
    const int n,
    const int k,
    const float alpha,
    const void *host_a,
    const int lda,
    const float *host_b,
    const int ldb,
    const float beta,
    float *host_c,
    const int ldc,
    const int btype);

#ifdef __cplusplus
}
#endif