llama_cpp 0.3.0 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
@@ -21,11 +21,19 @@
|
|
21
21
|
|
22
22
|
#define CL_DMMV_BLOCK_SIZE 32
|
23
23
|
|
24
|
+
#ifndef K_QUANTS_PER_ITERATION
|
25
|
+
#define K_QUANTS_PER_ITERATION 1
|
26
|
+
#else
|
27
|
+
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
28
|
+
#endif
|
29
|
+
|
24
30
|
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
25
31
|
static std::string program_source = MULTILINE_QUOTE(
|
26
32
|
|
27
33
|
typedef char int8_t;
|
28
34
|
typedef uchar uint8_t;
|
35
|
+
typedef short int16_t;
|
36
|
+
typedef ushort uint16_t;
|
29
37
|
typedef int int32_t;
|
30
38
|
typedef uint uint32_t;
|
31
39
|
|
@@ -175,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
|
|
175
183
|
*v0 = vload_half(0, &x[ib + 0]);
|
176
184
|
*v1 = vload_half(0, &x[ib + 1]);
|
177
185
|
}
|
186
|
+
);
|
178
187
|
|
188
|
+
static std::string k_quants_source = MULTILINE_QUOTE(
|
179
189
|
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
|
180
190
|
{
|
181
191
|
if (j < 4)
|
@@ -199,7 +209,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
|
199
209
|
const int is = 8 * n + l / 16;
|
200
210
|
|
201
211
|
const uint8_t q = x[i].qs[32 * n + l];
|
202
|
-
__global float *y = yy + i *
|
212
|
+
__global float *y = yy + i * QK_K + 128 * n;
|
203
213
|
|
204
214
|
const float dall = vload_half(0, &x[i].d);
|
205
215
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -231,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
|
231
241
|
float d_all = vload_half(0, &x[i].d);
|
232
242
|
float dl = d_all * (us - 32);
|
233
243
|
|
234
|
-
__global float *y = yy + i *
|
244
|
+
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
|
235
245
|
const __global uint8_t *q = x[i].qs + 32 * n;
|
236
246
|
const __global uint8_t *hm = x[i].hmask;
|
237
247
|
|
@@ -248,7 +258,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
|
248
258
|
const int is = 2 * il;
|
249
259
|
const int n = 4;
|
250
260
|
|
251
|
-
__global float *y = yy + i *
|
261
|
+
__global float *y = yy + i * QK_K + 64 * il + n * ir;
|
252
262
|
|
253
263
|
const float dall = vload_half(0, &x[i].d);
|
254
264
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -277,7 +287,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
|
277
287
|
const int ir = tid % 16;
|
278
288
|
const int is = 2 * il;
|
279
289
|
|
280
|
-
__global float *y = yy + i *
|
290
|
+
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
|
281
291
|
|
282
292
|
const float dall = vload_half(0, &x[i].d);
|
283
293
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -309,7 +319,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
|
|
309
319
|
const int il = tid - 32 * ip;
|
310
320
|
const int is = 8 * ip + il / 16;
|
311
321
|
|
312
|
-
__global float *y = yy + i *
|
322
|
+
__global float *y = yy + i * QK_K + 128 * ip + il;
|
313
323
|
|
314
324
|
const float d = vload_half(0, &x[i].d);
|
315
325
|
|
@@ -323,161 +333,387 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
|
|
323
333
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
324
334
|
}
|
325
335
|
|
336
|
+
__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
|
326
337
|
|
327
|
-
|
338
|
+
const int row = get_group_id(0);
|
328
339
|
|
329
|
-
int
|
330
|
-
int
|
331
|
-
int l = r / 8;
|
340
|
+
const int num_blocks_per_row = ncols / QK_K;
|
341
|
+
const int ib0 = row*num_blocks_per_row;
|
332
342
|
|
333
|
-
__global const
|
334
|
-
__global const uint8_t *q = x[ib].qs + 32 * n + l;
|
335
|
-
__global const uint8_t *s = x[ib].scales + 8 * n;
|
343
|
+
__global const struct block_q2_K * x = xx + ib0;
|
336
344
|
|
337
|
-
const
|
338
|
-
const
|
345
|
+
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
346
|
+
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
339
347
|
|
340
|
-
|
341
|
-
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
|
342
|
-
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
|
343
|
-
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
|
344
|
-
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
|
345
|
-
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
|
346
|
-
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
|
347
|
-
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
|
348
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
348
349
|
|
349
|
-
|
350
|
-
|
350
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
351
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
351
352
|
|
352
|
-
|
353
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
354
|
+
const int q_offset = 32*im + l0;
|
355
|
+
const int s_offset = 8*im;
|
356
|
+
const int y_offset = 128*im + l0;
|
353
357
|
|
354
|
-
|
355
|
-
const uint32_t kmask2 = 0x0f0f0f0f;
|
358
|
+
tmp[16 * ix + tid] = 0;
|
356
359
|
|
357
|
-
uint32_t aux[
|
358
|
-
|
360
|
+
uint32_t aux[4];
|
361
|
+
const uint8_t * d = (const uint8_t *)aux;
|
362
|
+
const uint8_t * m = (const uint8_t *)(aux + 2);
|
359
363
|
|
360
|
-
int
|
361
|
-
int r = iqs - 128*n;
|
362
|
-
int l = r/8;
|
364
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
363
365
|
|
364
|
-
|
365
|
-
|
366
|
-
__global const uint8_t * hm = x[ib].hmask + l;
|
367
|
-
const int8_t * s = (const int8_t *)utmp + 8*n;
|
366
|
+
__global const float * y = yy + i * QK_K + y_offset;
|
367
|
+
__global const uint8_t * q = x[i].qs + q_offset;
|
368
368
|
|
369
|
-
|
370
|
-
|
371
|
-
aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
|
369
|
+
const float dall = vload_half(0, &x[i].d);
|
370
|
+
const float dmin = vload_half(0, &x[i].dmin);
|
372
371
|
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
372
|
+
__global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
|
373
|
+
aux[0] = a[0] & 0x0f0f0f0f;
|
374
|
+
aux[1] = a[1] & 0x0f0f0f0f;
|
375
|
+
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
|
376
|
+
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
377
377
|
|
378
|
-
|
379
|
-
|
378
|
+
float sum1 = 0, sum2 = 0;
|
379
|
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
380
|
+
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
381
|
+
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
382
|
+
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
383
|
+
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
|
384
|
+
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
|
385
|
+
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
|
386
|
+
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
|
387
|
+
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
|
388
|
+
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
|
389
|
+
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
|
380
390
|
|
381
|
-
|
382
|
-
|
383
|
-
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
|
384
|
-
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
|
385
|
-
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
|
386
|
-
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
|
387
|
-
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
|
388
|
-
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
|
391
|
+
}
|
392
|
+
tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
|
389
393
|
|
390
|
-
|
394
|
+
}
|
391
395
|
|
396
|
+
// sum up partial sums and write back result
|
397
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
398
|
+
for (int s=16; s>0; s>>=1) {
|
399
|
+
if (tid < s) {
|
400
|
+
tmp[tid] += tmp[tid + s];
|
401
|
+
}
|
402
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
403
|
+
}
|
404
|
+
if (tid == 0) {
|
405
|
+
dst[row] = tmp[0];
|
406
|
+
}
|
392
407
|
}
|
393
408
|
|
394
|
-
void
|
409
|
+
__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
|
410
|
+
const uint16_t kmask1 = 0x0303;
|
411
|
+
const uint16_t kmask2 = 0x0f0f;
|
395
412
|
|
396
|
-
const int
|
397
|
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
398
|
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
413
|
+
const int row = get_group_id(0);
|
399
414
|
|
400
|
-
|
401
|
-
|
415
|
+
const int num_blocks_per_row = ncols / QK_K;
|
416
|
+
const int ib0 = row*num_blocks_per_row;
|
402
417
|
|
403
|
-
const
|
404
|
-
const float dmin = vload_half(0, &x[ib].dmin);
|
418
|
+
__global const struct block_q3_K * x = xx + ib0;
|
405
419
|
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
const
|
410
|
-
|
411
|
-
const
|
412
|
-
const
|
420
|
+
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
421
|
+
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
422
|
+
|
423
|
+
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
424
|
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
425
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
426
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
427
|
+
|
428
|
+
const uint8_t m = 1 << (4*im);
|
429
|
+
|
430
|
+
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
431
|
+
const int q_offset = 32*im + l0;
|
432
|
+
const int y_offset = 128*im + l0;
|
433
|
+
|
434
|
+
uint16_t utmp[4];
|
435
|
+
const int8_t * s = (const int8_t *)utmp;
|
436
|
+
|
437
|
+
const uint16_t s_shift = 4*im;
|
438
|
+
|
439
|
+
tmp[16 * ix + tid] = 0;
|
440
|
+
|
441
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
442
|
+
|
443
|
+
__global const float * y = yy + i * QK_K + y_offset;
|
444
|
+
__global const uint8_t * q = x[i].qs + q_offset;
|
445
|
+
__global const uint8_t * h = x[i].hmask + l0;
|
446
|
+
|
447
|
+
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
|
448
|
+
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
|
449
|
+
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
|
450
|
+
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
|
451
|
+
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
|
452
|
+
|
453
|
+
const float d = vload_half(0, &x[i].d);
|
454
|
+
|
455
|
+
float sum = 0;
|
456
|
+
for (int l = 0; l < n; ++l) {
|
457
|
+
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
|
458
|
+
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
|
459
|
+
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
|
460
|
+
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
|
461
|
+
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
|
462
|
+
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
|
463
|
+
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
|
464
|
+
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
|
465
|
+
}
|
466
|
+
tmp[16 * ix + tid] += d * sum;
|
413
467
|
|
414
|
-
float sum = 0;
|
415
|
-
for (int k = 0; k < 4; ++k) {
|
416
|
-
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
|
417
|
-
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
|
418
468
|
}
|
419
469
|
|
420
|
-
|
470
|
+
// sum up partial sums and write back result
|
471
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
472
|
+
for (int s=16; s>0; s>>=1) {
|
473
|
+
if (tid < s) {
|
474
|
+
tmp[tid] += tmp[tid + s];
|
475
|
+
}
|
476
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
477
|
+
}
|
478
|
+
if (tid == 0) {
|
479
|
+
dst[row] = tmp[0];
|
480
|
+
}
|
421
481
|
}
|
422
482
|
|
423
|
-
void
|
483
|
+
__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
|
424
484
|
|
425
|
-
|
426
|
-
const
|
427
|
-
const
|
485
|
+
// TODO: rename these masks later; the current names are kept only for testing
|
486
|
+
const uint16_t kmask1 = 0x3f3f;
|
487
|
+
const uint16_t kmask2 = 0x0f0f;
|
488
|
+
const uint16_t kmask3 = 0xc0c0;
|
428
489
|
|
429
|
-
|
430
|
-
|
431
|
-
|
490
|
+
const int row = get_group_id(0);
|
491
|
+
const int num_blocks_per_row = ncols / QK_K;
|
492
|
+
const int ib0 = row*num_blocks_per_row;
|
432
493
|
|
433
|
-
const
|
434
|
-
const
|
494
|
+
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
495
|
+
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
|
435
496
|
|
436
|
-
|
437
|
-
|
438
|
-
const
|
439
|
-
const
|
440
|
-
|
441
|
-
|
442
|
-
const
|
497
|
+
const int step = 8/K_QUANTS_PER_ITERATION;
|
498
|
+
|
499
|
+
const int il = tid/step; // 0...3
|
500
|
+
const int ir = tid - step*il;// 0...7 or 0...3
|
501
|
+
const int n = 2*K_QUANTS_PER_ITERATION;
|
502
|
+
|
503
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
504
|
+
const int in = il%2;
|
505
|
+
|
506
|
+
const int l0 = n*(2*ir + in);
|
507
|
+
const int q_offset = 32*im + l0;
|
508
|
+
const int y_offset = 64*im + l0;
|
509
|
+
|
510
|
+
uint16_t aux[4];
|
511
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
512
|
+
|
513
|
+
__global const struct block_q4_K * x = xx + ib0;
|
514
|
+
|
515
|
+
tmp[16 * ix + tid] = 0;
|
516
|
+
|
517
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
518
|
+
|
519
|
+
__global const uint8_t * q1 = x[i].qs + q_offset;
|
520
|
+
__global const uint8_t * q2 = q1 + 64;
|
521
|
+
__global const float * y1 = yy + i*QK_K + y_offset;
|
522
|
+
__global const float * y2 = y1 + 128;
|
523
|
+
|
524
|
+
const float dall = vload_half(0, &x[i].d);
|
525
|
+
const float dmin = vload_half(0, &x[i].dmin);
|
526
|
+
|
527
|
+
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
|
528
|
+
aux[0] = a[im+0] & kmask1;
|
529
|
+
aux[1] = a[im+2] & kmask1;
|
530
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
531
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
532
|
+
|
533
|
+
float4 s = (float4)(0.f);
|
534
|
+
float smin = 0;
|
535
|
+
for (int l = 0; l < n; ++l) {
|
536
|
+
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
|
537
|
+
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
|
538
|
+
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
539
|
+
}
|
540
|
+
tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
443
541
|
|
444
|
-
uint8_t hm = 1 << is;
|
445
|
-
float sum = 0;
|
446
|
-
for (int k = 0; k < 4; ++k) {
|
447
|
-
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
|
448
542
|
}
|
449
|
-
|
450
|
-
|
451
|
-
|
543
|
+
|
544
|
+
// sum up partial sums and write back result
|
545
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
546
|
+
for (int s=16; s>0; s>>=1) {
|
547
|
+
if (tid < s) {
|
548
|
+
tmp[tid] += tmp[tid + s];
|
549
|
+
}
|
550
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
551
|
+
}
|
552
|
+
if (tid == 0) {
|
553
|
+
dst[row] = tmp[0];
|
554
|
+
}
|
555
|
+
}
|
556
|
+
|
557
|
+
__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
|
558
|
+
|
559
|
+
const uint16_t kmask1 = 0x3f3f;
|
560
|
+
const uint16_t kmask2 = 0x0f0f;
|
561
|
+
const uint16_t kmask3 = 0xc0c0;
|
562
|
+
|
563
|
+
const int row = get_group_id(0);
|
564
|
+
const int num_blocks_per_row = ncols / QK_K;
|
565
|
+
const int ib0 = row*num_blocks_per_row;
|
566
|
+
|
567
|
+
const int tid = get_local_id(0)/2; // 0...15
|
568
|
+
const int ix = get_local_id(0)%2;
|
569
|
+
|
570
|
+
const int il = tid/4; // 0...3
|
571
|
+
const int ir = tid - 4*il;// 0...3
|
572
|
+
const int n = 2;
|
573
|
+
|
574
|
+
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
575
|
+
const int in = il%2;
|
576
|
+
|
577
|
+
const int l0 = n*(2*ir + in);
|
578
|
+
const int q_offset = 32*im + l0;
|
579
|
+
const int y_offset = 64*im + l0;
|
580
|
+
|
581
|
+
const uint8_t hm1 = 1 << (2*im);
|
582
|
+
const uint8_t hm2 = hm1 << 4;
|
583
|
+
|
584
|
+
uint16_t aux[4];
|
585
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
586
|
+
|
587
|
+
__global const struct block_q5_K * x = xx + ib0;
|
588
|
+
|
589
|
+
tmp[16 * ix + tid] = 0;
|
590
|
+
|
591
|
+
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
592
|
+
|
593
|
+
__global const uint8_t * ql1 = x[i].qs + q_offset;
|
594
|
+
__global const uint8_t * ql2 = ql1 + 64;
|
595
|
+
__global const uint8_t * qh = x[i].qh + l0;
|
596
|
+
__global const float * y1 = yy + i*QK_K + y_offset;
|
597
|
+
__global const float * y2 = y1 + 128;
|
598
|
+
|
599
|
+
const float dall = vload_half(0, &x[i].d);
|
600
|
+
const float dmin = vload_half(0, &x[i].dmin);
|
601
|
+
|
602
|
+
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
|
603
|
+
aux[0] = a[im+0] & kmask1;
|
604
|
+
aux[1] = a[im+2] & kmask1;
|
605
|
+
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
606
|
+
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
607
|
+
|
608
|
+
float4 sum = (float4)(0.f);
|
609
|
+
float smin = 0;
|
610
|
+
for (int l = 0; l < n; ++l) {
|
611
|
+
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
|
612
|
+
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
|
613
|
+
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
|
614
|
+
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
|
615
|
+
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
|
616
|
+
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
|
617
|
+
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
|
618
|
+
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
|
619
|
+
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
620
|
+
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
621
|
+
}
|
622
|
+
tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
623
|
+
|
452
624
|
}
|
453
|
-
*result = sum;
|
454
625
|
|
626
|
+
// sum up partial sums and write back result
|
627
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
628
|
+
for (int s=16; s>0; s>>=1) {
|
629
|
+
if (tid < s) {
|
630
|
+
tmp[tid] += tmp[tid + s];
|
631
|
+
}
|
632
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
633
|
+
}
|
634
|
+
if (tid == 0) {
|
635
|
+
dst[row] = tmp[0];
|
636
|
+
}
|
455
637
|
}
|
456
638
|
|
457
|
-
void
|
639
|
+
__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
|
458
640
|
|
641
|
+
const int row = get_group_id(0);
|
459
642
|
|
460
|
-
const int
|
461
|
-
const int
|
462
|
-
const int is = 8*ip;
|
643
|
+
const int num_blocks_per_row = ncols / QK_K;
|
644
|
+
const int ib0 = row*num_blocks_per_row;
|
463
645
|
|
464
|
-
__global const
|
646
|
+
__global const struct block_q6_K * x = xx + ib0;
|
465
647
|
|
466
|
-
const
|
648
|
+
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
649
|
+
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
650
|
+
|
651
|
+
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
652
|
+
|
653
|
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
654
|
+
const int in = tid - step*im; // 0...15 or 0...7
|
655
|
+
|
656
|
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
657
|
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
658
|
+
const int is = 0;
|
659
|
+
|
660
|
+
\n#else\n
|
467
661
|
|
468
|
-
|
469
|
-
|
470
|
-
__global const int8_t * sc = x[ib].scales + is;
|
662
|
+
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
663
|
+
const int is = in / 4;
|
471
664
|
|
472
|
-
|
473
|
-
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
|
474
|
-
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
|
475
|
-
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
|
476
|
-
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
|
477
|
-
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
|
478
|
-
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
|
479
|
-
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
|
665
|
+
\n#endif\n
|
480
666
|
|
667
|
+
const int ql_offset = 64*im + l0;
|
668
|
+
const int qh_offset = 32*im + l0;
|
669
|
+
const int s_offset = 8*im + is;
|
670
|
+
const int y_offset = 128*im + l0;
|
671
|
+
|
672
|
+
tmp[16 * ix + tid] = 0; // partial sum for this work-item's slot in local memory
|
673
|
+
|
674
|
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
675
|
+
|
676
|
+
__global const float * y = yy + i * QK_K + y_offset;
|
677
|
+
__global const uint8_t * ql = x[i].ql + ql_offset;
|
678
|
+
__global const uint8_t * qh = x[i].qh + qh_offset;
|
679
|
+
__global const int8_t * s = x[i].scales + s_offset;
|
680
|
+
|
681
|
+
const float d = vload_half(0, &x[i].d);
|
682
|
+
|
683
|
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
684
|
+
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
685
|
+
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
686
|
+
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
687
|
+
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
688
|
+
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
689
|
+
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
690
|
+
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
691
|
+
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
692
|
+
tmp[16 * ix + tid] += sum;
|
693
|
+
\n#else\n
|
694
|
+
float sum = 0;
|
695
|
+
for (int l = 0; l < 4; ++l) {
|
696
|
+
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
697
|
+
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
|
698
|
+
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
|
699
|
+
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
700
|
+
}
|
701
|
+
tmp[16 * ix + tid] += sum;
|
702
|
+
\n#endif\n
|
703
|
+
|
704
|
+
}
|
705
|
+
|
706
|
+
// sum up partial sums and write back result
|
707
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
708
|
+
for (int s=16; s>0; s>>=1) {
|
709
|
+
if (tid < s) {
|
710
|
+
tmp[tid] += tmp[tid + s];
|
711
|
+
}
|
712
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
713
|
+
}
|
714
|
+
if (tid == 0) {
|
715
|
+
dst[row] = tmp[0];
|
716
|
+
}
|
481
717
|
}
|
482
718
|
|
483
719
|
);
|
@@ -549,44 +785,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|
549
785
|
}
|
550
786
|
);
|
551
787
|
|
552
|
-
std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
|
553
|
-
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
554
|
-
const int block_size = get_local_size(0);
|
555
|
-
const int row = get_group_id(0);
|
556
|
-
const int tid = get_local_id(0);
|
557
|
-
|
558
|
-
const int iter_stride = 256;
|
559
|
-
const int vals_per_iter = iter_stride / block_size;
|
560
|
-
const int num_blocks_per_row = ncols / 256;
|
561
|
-
const int ib0 = row*num_blocks_per_row;
|
562
|
-
|
563
|
-
tmp[tid] = 0;
|
564
|
-
|
565
|
-
for (int i = 0; i < ncols; i += iter_stride) {
|
566
|
-
const int col = i + vals_per_iter*tid;
|
567
|
-
const int ib = ib0 + col/256; // x block index
|
568
|
-
const int iqs = col%256; // x quant index
|
569
|
-
const int iybs = col - col%256; // y block start index
|
570
|
-
|
571
|
-
// dequantize
|
572
|
-
float v;
|
573
|
-
DOT_KERNEL(x, ib, iqs, y + iybs, &v);
|
574
|
-
tmp[tid] += v;
|
575
|
-
}
|
576
|
-
|
577
|
-
// sum up partial sums and write back result
|
578
|
-
barrier(CLK_LOCAL_MEM_FENCE);
|
579
|
-
for (int s=block_size/2; s>0; s>>=1) {
|
580
|
-
if (tid < s) {
|
581
|
-
tmp[tid] += tmp[tid + s];
|
582
|
-
}
|
583
|
-
barrier(CLK_LOCAL_MEM_FENCE);
|
584
|
-
}
|
585
|
-
if (tid == 0) {
|
586
|
-
dst[row] = tmp[0];
|
587
|
-
}
|
588
|
-
}
|
589
|
-
);
|
590
788
|
|
591
789
|
std::string mul_template = MULTILINE_QUOTE(
|
592
790
|
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
|
@@ -649,18 +847,6 @@ std::array<std::string, 2> mul_str_values = {
|
|
649
847
|
"mul_f32", "float"
|
650
848
|
};
|
651
849
|
|
652
|
-
std::array<std::string, 3> dmmv_k_str_keys = {
|
653
|
-
"KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
|
654
|
-
};
|
655
|
-
|
656
|
-
std::array<std::string, 15> dmmv_k_str_values = {
|
657
|
-
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
|
658
|
-
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
|
659
|
-
"dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
|
660
|
-
"dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
|
661
|
-
"dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
|
662
|
-
};
|
663
|
-
|
664
850
|
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
665
851
|
size_t pos = 0;
|
666
852
|
while ((pos = s.find(from, pos)) != std::string::npos) {
|
@@ -673,6 +859,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
|
673
859
|
std::string generate_kernels() {
|
674
860
|
std::stringstream src;
|
675
861
|
src << program_source << '\n';
|
862
|
+
src << k_quants_source << '\n';
|
676
863
|
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
|
677
864
|
std::string dequant_kernel = dequant_template;
|
678
865
|
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
@@ -690,13 +877,6 @@ std::string generate_kernels() {
|
|
690
877
|
}
|
691
878
|
src << mul_kernel << '\n';
|
692
879
|
}
|
693
|
-
for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
|
694
|
-
std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
|
695
|
-
for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
|
696
|
-
replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
|
697
|
-
}
|
698
|
-
src << dmmv_k_kernel << '\n';
|
699
|
-
}
|
700
880
|
|
701
881
|
return src.str();
|
702
882
|
}
|
@@ -729,10 +909,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|
729
909
|
exit(1);
|
730
910
|
}
|
731
911
|
|
732
|
-
|
733
|
-
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1"
|
912
|
+
std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
|
913
|
+
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
|
914
|
+
"-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
|
734
915
|
|
735
|
-
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
|
916
|
+
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
|
736
917
|
if(err < 0) {
|
737
918
|
|
738
919
|
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
@@ -1199,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
1199
1380
|
const int64_t ne00 = src0->ne[0];
|
1200
1381
|
const int64_t ne01 = src0->ne[1];
|
1201
1382
|
const int64_t ne02 = src0->ne[2];
|
1202
|
-
const int64_t ne03 = src0->ne[
|
1383
|
+
const int64_t ne03 = src0->ne[3];
|
1203
1384
|
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
1204
1385
|
const int64_t ne10 = src1->ne[0];
|
1205
1386
|
const int64_t ne11 = src1->ne[1];
|