llama_cpp 0.3.4 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
@@ -67,6 +67,17 @@ kernel void kernel_add(
|
|
67
67
|
dst[tpig] = src0[tpig] + src1[tpig];
|
68
68
|
}
|
69
69
|
|
70
|
+
// assumption: src1 is a row
|
71
|
+
// broadcast src1 into src0
|
72
|
+
kernel void kernel_add_row(
|
73
|
+
device const float * src0,
|
74
|
+
device const float * src1,
|
75
|
+
device float * dst,
|
76
|
+
constant int64_t & ne00,
|
77
|
+
uint tpig[[thread_position_in_grid]]) {
|
78
|
+
dst[tpig] = src0[tpig] + src1[tpig % ne00];
|
79
|
+
}
|
80
|
+
|
70
81
|
kernel void kernel_mul(
|
71
82
|
device const float * src0,
|
72
83
|
device const float * src1,
|
@@ -376,87 +387,90 @@ kernel void kernel_rms_norm(
|
|
376
387
|
}
|
377
388
|
}
|
378
389
|
|
379
|
-
// function for calculate inner product between a q4_0 block and
|
380
|
-
|
390
|
+
// function to calculate the inner product between half a q4_0 block and 16 floats (yl); sumy is the sum of the corresponding unscaled y values
|
391
|
+
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
392
|
+
// we assume that the yl's have been multiplied with the appropriate scale factor
|
393
|
+
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
394
|
+
inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
|
381
395
|
float d = qb_curr->d;
|
382
|
-
|
383
|
-
device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
|
384
|
-
for (int i = 0; i <
|
385
|
-
acc[0] += yl[i]
|
386
|
-
|
387
|
-
acc[
|
388
|
-
|
396
|
+
float2 acc = 0.f;
|
397
|
+
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
|
398
|
+
for (int i = 0; i < 8; i+=2) {
|
399
|
+
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
|
400
|
+
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
401
|
+
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
402
|
+
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
389
403
|
}
|
390
|
-
return d * (sumy * -8.f + acc[0] + acc[1]
|
404
|
+
return d * (sumy * -8.f + acc[0] + acc[1]);
|
391
405
|
}
|
392
406
|
|
393
|
-
// function for calculate inner product between a q4_1 block and
|
394
|
-
|
407
|
+
// function to calculate the inner product between half a q4_1 block and 16 floats (yl); sumy is the sum of the corresponding unscaled y values
|
408
|
+
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
409
|
+
// we assume that the yl's have been multiplied with the appropriate scale factor
|
410
|
+
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
411
|
+
inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
|
395
412
|
float d = qb_curr->d;
|
396
413
|
float m = qb_curr->m;
|
397
|
-
|
398
|
-
|
399
|
-
for (int i = 0; i <
|
400
|
-
acc[0] += yl[i]
|
401
|
-
|
402
|
-
acc[
|
403
|
-
|
414
|
+
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
|
415
|
+
float2 acc = 0.f;
|
416
|
+
for (int i = 0; i < 8; i+=2) {
|
417
|
+
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
|
418
|
+
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
419
|
+
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
420
|
+
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
404
421
|
}
|
405
|
-
return d * (acc[0] + acc[1]
|
422
|
+
return d * (acc[0] + acc[1]) + sumy * m;
|
406
423
|
}
|
407
424
|
|
408
425
|
// putting them in the kernel cause a significant performance penalty
|
409
426
|
#define N_DST 4 // each SIMD group works on 4 rows
|
410
427
|
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
411
428
|
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
412
|
-
template
|
429
|
+
//Note: This is a template, but strictly speaking it only applies to
|
430
|
+
// quantizations where the block size is 32. It also does not
|
431
|
+
// guard against the number of rows not being divisible by
|
432
|
+
// N_DST, so this is another explicit assumption of the implementation.
|
433
|
+
template<typename block_q_type, int nr, int nsg, int nw>
|
413
434
|
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
|
414
435
|
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
|
415
436
|
uint2 tgpig, uint tiisg, uint sgitg) {
|
416
437
|
const int nb = ne00/QK4_0;
|
417
438
|
const int r0 = tgpig.x;
|
418
439
|
const int r1 = tgpig.y;
|
419
|
-
|
440
|
+
const int first_row = (r0 * nsg + sgitg) * nr;
|
441
|
+
device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
|
420
442
|
device const float * y = (device const float *) src1 + r1*ne10;
|
421
|
-
|
422
|
-
float sumf[
|
423
|
-
thread float * yl=(thread float *)y_curr;
|
443
|
+
float yl[16]; // src1 vector cache
|
444
|
+
float sumf[nr]={0.f};
|
424
445
|
|
425
|
-
|
426
|
-
|
427
|
-
float sumy = 0;
|
428
|
-
for (int i = 0; i < QK4_0 / 4; i++) {
|
429
|
-
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
|
430
|
-
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
431
|
-
}
|
446
|
+
const int ix = tiisg/2;
|
447
|
+
const int il = 8*(tiisg%2);
|
432
448
|
|
433
|
-
|
434
|
-
sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
|
435
|
-
}
|
436
|
-
}
|
449
|
+
device const float * yb = y + ix * QK4_0 + il;
|
437
450
|
|
438
|
-
//
|
439
|
-
int
|
440
|
-
int ib = tiisg % (N_SIMDWIDTH / 2);
|
441
|
-
for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
|
442
|
-
int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
|
451
|
+
// each thread in a SIMD group deals with half a block.
|
452
|
+
for (int ib = ix; ib < nb; ib += nw/2) {
|
443
453
|
float sumy = 0;
|
444
|
-
for (int i = 0; i <
|
445
|
-
|
446
|
-
|
454
|
+
for (int i = 0; i < 8; i += 2) {
|
455
|
+
sumy += yb[i] + yb[i+1];
|
456
|
+
yl[i+0] = yb[i+ 0];
|
457
|
+
yl[i+1] = yb[i+ 1]/256.f;
|
458
|
+
sumy += yb[i+16] + yb[i+17];
|
459
|
+
yl[i+8] = yb[i+16]/16.f;
|
460
|
+
yl[i+9] = yb[i+17]/4096.f;
|
447
461
|
}
|
448
462
|
|
449
|
-
for (int row = 0; row <
|
450
|
-
|
451
|
-
sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
|
452
|
-
}
|
463
|
+
for (int row = 0; row < nr; row++) {
|
464
|
+
sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
|
453
465
|
}
|
466
|
+
|
467
|
+
yb += QK4_0 * 16;
|
454
468
|
}
|
455
469
|
|
456
|
-
for (int row = 0; row <
|
457
|
-
|
458
|
-
if (tiisg == 0 &&
|
459
|
-
dst[r1*ne0 +
|
470
|
+
for (int row = 0; row < nr; ++row) {
|
471
|
+
const float tot = simd_sum(sumf[row]);
|
472
|
+
if (tiisg == 0 && first_row + row < ne01) {
|
473
|
+
dst[r1*ne0 + first_row + row] = tot;
|
460
474
|
}
|
461
475
|
}
|
462
476
|
}
|
@@ -472,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
|
|
472
486
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
473
487
|
uint tiisg[[thread_index_in_simdgroup]],
|
474
488
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
475
|
-
mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
489
|
+
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
476
490
|
}
|
477
491
|
|
478
492
|
kernel void kernel_mul_mat_q4_1_f32(
|
@@ -486,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
|
|
486
500
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
487
501
|
uint tiisg[[thread_index_in_simdgroup]],
|
488
502
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
489
|
-
mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
503
|
+
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
490
504
|
}
|
491
505
|
|
492
506
|
kernel void kernel_mul_mat_f16_f32(
|
@@ -495,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
|
|
495
509
|
device float * dst,
|
496
510
|
constant int64_t & ne00,
|
497
511
|
constant int64_t & ne01,
|
512
|
+
constant int64_t & ne02,
|
498
513
|
constant uint64_t & nb00,
|
499
514
|
constant uint64_t & nb01,
|
500
515
|
constant uint64_t & nb02,
|
501
516
|
constant int64_t & ne10,
|
502
517
|
constant int64_t & ne11,
|
518
|
+
constant int64_t & ne12,
|
503
519
|
constant uint64_t & nb10,
|
504
520
|
constant uint64_t & nb11,
|
505
521
|
constant uint64_t & nb12,
|
@@ -515,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|
515
531
|
const int64_t r1 = tgpig.y;
|
516
532
|
const int64_t im = tgpig.z;
|
517
533
|
|
518
|
-
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
|
534
|
+
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
519
535
|
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
520
536
|
|
521
537
|
sum[tpitg.x] = 0.0f;
|
@@ -538,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|
538
554
|
}
|
539
555
|
}
|
540
556
|
|
557
|
+
|
541
558
|
kernel void kernel_alibi_f32(
|
542
559
|
device const float * src0,
|
543
560
|
device float * dst,
|