llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
@@ -67,6 +67,17 @@ kernel void kernel_add(
     dst[tpig] = src0[tpig] + src1[tpig];
 }
 
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        device const float * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+}
+
 kernel void kernel_mul(
         device const float * src0,
         device const float * src1,
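The new `kernel_add_row` performs a row-broadcast add: `src1` holds a single row of `ne00` floats, and every element of `src0` picks up the row element at its column via `tpig % ne00`. A host-side C++ sketch of the same indexing (the `add_row` helper and its names are illustrative, not part of the gem):

```cpp
#include <cstdint>
#include <vector>

// Scalar reference for the kernel_add_row indexing: src1 is one row of
// ne00 floats, broadcast across every row of src0.
std::vector<float> add_row(const std::vector<float>& src0,
                           const std::vector<float>& src1,
                           std::int64_t ne00) {
    std::vector<float> dst(src0.size());
    for (std::size_t i = 0; i < src0.size(); ++i) {
        dst[i] = src0[i] + src1[i % static_cast<std::size_t>(ne00)];
    }
    return dst;
}
```

This is the usual bias-add pattern: adding a 1×ne00 tensor to an n×ne00 tensor without materializing n copies of the row.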
@@ -376,87 +387,90 @@ kernel void kernel_rms_norm(
 }
 }
 
-// function for calculate inner product between a q4_0 block and
-
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
-
-    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
-    for (int i = 0; i <
-        acc[0] += yl[i]
-
-        acc[
-
+    float2 acc = 0.f;
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (sumy * -8.f + acc[0] + acc[1]
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
 
-// function for calculate inner product between a q4_1 block and
-
+// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
     float m = qb_curr->m;
-
-
-    for (int i = 0; i <
-        acc[0] += yl[i]
-
-        acc[
-
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+    float2 acc = 0.f;
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (acc[0] + acc[1]
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
 
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-template
+//Note: This is a template, but strictly speaking it only applies to
+//      quantizations where the block size is 32. It also does not
+//      giard against the number of rows not being divisible by
+//      N_DST, so this is another explicit assumption of the implementation.
+template<typename block_q_type, int nr, int nsg, int nw>
 void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
                     int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
                     uint2 tgpig, uint tiisg, uint sgitg) {
     const int nb = ne00/QK4_0;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
-
+    const int first_row = (r0 * nsg + sgitg) * nr;
+    device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
     device const float * y = (device const float *) src1 + r1*ne10;
-
-    float sumf[
-    thread float * yl=(thread float *)y_curr;
+    float yl[16]; // src1 vector cache
+    float sumf[nr]={0.f};
 
-
-
-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
-        }
+    const int ix = tiisg/2;
+    const int il = 8*(tiisg%2);
 
-
-            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
-        }
-    }
+    device const float * yb = y + ix * QK4_0 + il;
 
-    //
-    int
-    int ib = tiisg % (N_SIMDWIDTH / 2);
-    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
-        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
         float sumy = 0;
-        for (int i = 0; i <
-
-
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f;
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f;
+            yl[i+9] = yb[i+17]/4096.f;
         }
 
-        for (int row = 0; row <
-
-                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
-            }
+        for (int row = 0; row < nr; row++) {
+            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
         }
+
+        yb += QK4_0 * 16;
     }
 
-    for (int row = 0; row <
-
-        if (tiisg == 0 &&
-            dst[r1*ne0 +
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + first_row + row] = tot;
         }
     }
 }
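The rewritten `block_q_n_dot_y` works on half a block at a time: `il` selects which 16 quants a thread handles, and the caller pre-divides the cached `src1` values by 1, 256, 16 and 4096 so the packed nibbles can be used with a mask but no bit shift (the `sumy * -8.f` term then folds in the Q4_0 offset of 8 per quant). The shift-folding identity is easy to verify in plain C++; this standalone sketch uses made-up values and is not code from the diff:

```cpp
#include <cstdint>
#include <cstdio>

// A uint16 packs four 4-bit quants at bit offsets 0, 4, 8 and 12. Masking a
// nibble in place leaves it scaled by 1, 16, 256 or 4096; pre-dividing the
// matching y value by the same factor restores the true product, which is
// exactly what the yl[i+1]/256.f-style loads in mul_vec_q_n_f32 prepare.
int main() {
    const std::uint16_t qs   = 0xB7C3;  // packed quants: 3, 12, 7, 11
    const float         y[4] = {0.5f, -1.25f, 2.0f, 0.75f};

    // Reference: extract each quant with an explicit shift.
    const float ref = ((qs      ) & 0xF) * y[0]
                    + ((qs >>  4) & 0xF) * y[1]
                    + ((qs >>  8) & 0xF) * y[2]
                    + ((qs >> 12) & 0xF) * y[3];

    // Kernel-style: mask in place, fold the shift into the y scale factor.
    const float acc = (qs & 0x000F) * (y[0] /    1.f)
                    + (qs & 0x00F0) * (y[1] /   16.f)
                    + (qs & 0x0F00) * (y[2] /  256.f)
                    + (qs & 0xF000) * (y[3] / 4096.f);

    std::printf("ref=%f acc=%f\n", ref, acc);  // both print the same value
    return 0;
}
```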
@@ -472,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
         uint2 tgpig[[threadgroup_position_in_grid]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mat_q4_1_f32(
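The call-site change follows from the new signature: `mul_vec_q_n_f32` now receives the row count and SIMD geometry as non-type template parameters (`nr`, `nsg`, `nw`) instead of reading the `N_*` macros inside the body, which lets it declare `float sumf[nr]` with a compile-time size. A minimal C++ analogue of the pattern (stub types, illustrative only):

```cpp
#include <cstdio>

struct block_q4_0_stub {};  // stand-in for the real quantized block type

// Analogue of the new signature: nr/nsg/nw are compile-time constants per
// instantiation, so stack arrays such as sumf can be sized statically.
template<typename block_q_type, int nr, int nsg, int nw>
void describe_kernel() {
    float sumf[nr] = {0.f};  // one accumulator per output row
    std::printf("nr=%d nsg=%d nw=%d sumf[0]=%.1f\n", nr, nsg, nw, sumf[0]);
}

int main() {
    // Mirrors mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>.
    describe_kernel<block_q4_0_stub, 4, 2, 32>();
    return 0;
}
```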
@@ -486,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
         uint2 tgpig[[threadgroup_position_in_grid]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mat_f16_f32(
@@ -495,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
+        constant   int64_t & ne02,
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
+        constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
@@ -515,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
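The functional change here is the `src0` slice index: `im*nb02` becomes `im/(ne12/ne02)*nb02`, so `src1` may carry more third-dimension slices (`ne12`) than `src0` (`ne02`, with `ne12` a multiple of `ne02`), and each group of `ne12/ne02` consecutive `src1` slices reuses one `src0` slice. A tiny sketch of the mapping with hypothetical sizes:

```cpp
#include <cstdint>
#include <cstdio>

// Broadcast of src0 over the third dimension: src1 slice im is paired with
// src0 slice im/(ne12/ne02).
int main() {
    const std::int64_t ne02 = 2;  // src0 slices
    const std::int64_t ne12 = 6;  // src1 slices, 3 per src0 slice
    for (std::int64_t im = 0; im < ne12; ++im) {
        std::printf("src1 slice %lld -> src0 slice %lld\n",
                    static_cast<long long>(im),
                    static_cast<long long>(im / (ne12 / ne02)));
    }
    return 0;
}
```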
@@ -538,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
         device const float * src0,
         device       float * dst,