llama_cpp 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +319 -52
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +138 -72
- data/ext/llama_cpp/src/llama.h +33 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
|
|
117
117
|
|
118
118
|
//================================= k-quants
|
119
119
|
|
120
|
+
#ifdef GGML_QKK_64
|
121
|
+
#define QK_K 64
|
122
|
+
#define K_SCALE_SIZE 4
|
123
|
+
#else
|
120
124
|
#define QK_K 256
|
125
|
+
#define K_SCALE_SIZE 12
|
126
|
+
#endif
|
121
127
|
|
122
128
|
typedef struct {
|
123
129
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
@@ -128,13 +134,25 @@ typedef struct {
|
|
128
134
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
129
135
|
|
130
136
|
typedef struct {
|
131
|
-
uint8_t hmask[QK_K/8];
|
132
|
-
uint8_t qs[QK_K/4];
|
133
|
-
|
134
|
-
|
137
|
+
uint8_t hmask[QK_K/8]; // quants - high bit
|
138
|
+
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
139
|
+
#ifdef GGML_QKK_64
|
140
|
+
uint8_t scales[2]; // scales, quantized with 8 bits
|
141
|
+
#else
|
142
|
+
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
143
|
+
#endif
|
144
|
+
half d; // super-block scale
|
135
145
|
} block_q3_K;
|
136
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +
|
146
|
+
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
137
147
|
|
148
|
+
#ifdef GGML_QKK_64
|
149
|
+
typedef struct {
|
150
|
+
half d[2]; // super-block scales/mins
|
151
|
+
uint8_t scales[2]; // 4-bit block scales/mins
|
152
|
+
uint8_t qs[QK_K/2]; // 4--bit quants
|
153
|
+
} block_q4_K;
|
154
|
+
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
155
|
+
#else
|
138
156
|
typedef struct {
|
139
157
|
half d; // super-block scale for quantized scales
|
140
158
|
half dmin; // super-block scale for quantized mins
|
@@ -142,15 +160,26 @@ typedef struct {
|
|
142
160
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
143
161
|
} block_q4_K;
|
144
162
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
163
|
+
#endif
|
145
164
|
|
165
|
+
#ifdef GGML_QKK_64
|
146
166
|
typedef struct {
|
147
|
-
half
|
148
|
-
|
149
|
-
uint8_t
|
167
|
+
half d; // super-block scale
|
168
|
+
int8_t scales[QK_K/16]; // block scales
|
169
|
+
uint8_t qh[QK_K/8]; // quants, high bit
|
170
|
+
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
171
|
+
} block_q5_K;
|
172
|
+
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
173
|
+
#else
|
174
|
+
typedef struct {
|
175
|
+
half d; // super-block scale for quantized scales
|
176
|
+
half dmin; // super-block scale for quantized mins
|
177
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
150
178
|
uint8_t qh[QK_K/8]; // quants, high bit
|
151
179
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
152
180
|
} block_q5_K;
|
153
|
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +
|
181
|
+
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
182
|
+
#endif
|
154
183
|
|
155
184
|
typedef struct {
|
156
185
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
@@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
|
349
378
|
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
350
379
|
|
351
380
|
const int i = blockIdx.x;
|
381
|
+
const block_q2_K * x = (const block_q2_K *) vx;
|
382
|
+
|
352
383
|
const int tid = threadIdx.x;
|
384
|
+
#if QK_K == 256
|
353
385
|
const int n = tid/32;
|
354
386
|
const int l = tid - 32*n;
|
355
387
|
const int is = 8*n + l/16;
|
356
388
|
|
357
|
-
const block_q2_K * x = (const block_q2_K *) vx;
|
358
|
-
|
359
389
|
const uint8_t q = x[i].qs[32*n + l];
|
360
390
|
float * y = yy + i*QK_K + 128*n;
|
361
391
|
|
@@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
|
365
395
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
366
396
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
367
397
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
398
|
+
#else
|
399
|
+
const int is = tid/16; // 0 or 1
|
400
|
+
const int il = tid%16; // 0...15
|
401
|
+
const uint8_t q = x[i].qs[il] >> (2*is);
|
402
|
+
float * y = yy + i*QK_K + 16*is + il;
|
403
|
+
float dall = x[i].d;
|
404
|
+
float dmin = x[i].dmin;
|
405
|
+
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
406
|
+
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
407
|
+
#endif
|
368
408
|
|
369
409
|
}
|
370
410
|
|
371
411
|
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
372
412
|
|
373
|
-
int
|
374
|
-
int i = blockIdx.x;
|
375
|
-
int tid = r/2;
|
376
|
-
int is0 = r%2;
|
377
|
-
int l0 = 16*is0 + 4*(threadIdx.x%4);
|
378
|
-
int n = tid / 4;
|
379
|
-
int j = tid - 4*n;
|
380
|
-
|
413
|
+
const int i = blockIdx.x;
|
381
414
|
const block_q3_K * x = (const block_q3_K *) vx;
|
382
415
|
|
416
|
+
#if QK_K == 256
|
417
|
+
const int r = threadIdx.x/4;
|
418
|
+
const int tid = r/2;
|
419
|
+
const int is0 = r%2;
|
420
|
+
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
421
|
+
const int n = tid / 4;
|
422
|
+
const int j = tid - 4*n;
|
423
|
+
|
383
424
|
uint8_t m = 1 << (4*n + j);
|
384
425
|
int is = 8*n + 2*j + is0;
|
385
426
|
int shift = 2*j;
|
@@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
|
396
437
|
const uint8_t * hm = x[i].hmask;
|
397
438
|
|
398
439
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
440
|
+
#else
|
441
|
+
const int tid = threadIdx.x;
|
442
|
+
const int is = tid/16; // 0 or 1
|
443
|
+
const int il = tid%16; // 0...15
|
444
|
+
const int im = il/8; // 0...1
|
445
|
+
const int in = il%8; // 0...7
|
446
|
+
|
447
|
+
float * y = yy + i*QK_K + 16*is + il;
|
448
|
+
|
449
|
+
const uint8_t q = x[i].qs[il] >> (2*is);
|
450
|
+
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
451
|
+
const float d = (float)x[i].d;
|
452
|
+
|
453
|
+
if (is == 0) {
|
454
|
+
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
455
|
+
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
456
|
+
} else {
|
457
|
+
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
458
|
+
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
459
|
+
}
|
460
|
+
#endif
|
399
461
|
|
400
462
|
}
|
401
463
|
|
464
|
+
#if QK_K == 256
|
402
465
|
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
403
466
|
if (j < 4) {
|
404
467
|
d = q[j] & 63; m = q[j + 4] & 63;
|
@@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
|
407
470
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
408
471
|
}
|
409
472
|
}
|
473
|
+
#endif
|
410
474
|
|
411
475
|
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
412
476
|
const block_q4_K * x = (const block_q4_K *) vx;
|
413
477
|
|
414
478
|
const int i = blockIdx.x;
|
415
479
|
|
416
|
-
|
417
|
-
//const int tid = threadIdx.x;
|
418
|
-
//const int il = tid/16;
|
419
|
-
//const int ir = tid%16;
|
420
|
-
//const int is = 2*il;
|
421
|
-
//const int n = 2;
|
422
|
-
|
480
|
+
#if QK_K == 256
|
423
481
|
// assume 32 threads
|
424
482
|
const int tid = threadIdx.x;
|
425
483
|
const int il = tid/8;
|
@@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
|
443
501
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
444
502
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
445
503
|
}
|
504
|
+
#else
|
505
|
+
const int tid = threadIdx.x;
|
506
|
+
const uint8_t * q = x[i].qs;
|
507
|
+
float * y = yy + i*QK_K;
|
508
|
+
const float d = (float)x[i].d[0];
|
509
|
+
const float m = (float)x[i].d[1];
|
510
|
+
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
511
|
+
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
512
|
+
#endif
|
446
513
|
}
|
447
514
|
|
448
515
|
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
@@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
|
450
517
|
|
451
518
|
const int i = blockIdx.x;
|
452
519
|
|
520
|
+
#if QK_K == 256
|
453
521
|
// assume 64 threads - this is very slightly better than the one below
|
454
522
|
const int tid = threadIdx.x;
|
455
523
|
const int il = tid/16; // il is in 0...3
|
@@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
|
476
544
|
hm <<= 1;
|
477
545
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
478
546
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
547
|
+
#else
|
548
|
+
const int tid = threadIdx.x;
|
549
|
+
const uint8_t q = x[i].qs[tid];
|
550
|
+
const int im = tid/8; // 0...3
|
551
|
+
const int in = tid%8; // 0...7
|
552
|
+
const int is = tid/16; // 0 or 1
|
553
|
+
const uint8_t h = x[i].qh[in] >> im;
|
554
|
+
const float d = x[i].d;
|
555
|
+
float * y = yy + i*QK_K + tid;
|
556
|
+
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
557
|
+
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
558
|
+
#endif
|
479
559
|
}
|
480
560
|
|
481
561
|
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
482
562
|
const block_q6_K * x = (const block_q6_K *) vx;
|
483
563
|
|
484
564
|
const int i = blockIdx.x;
|
565
|
+
#if QK_K == 256
|
485
566
|
|
486
567
|
// assume 64 threads - this is very slightly better than the one below
|
487
568
|
const int tid = threadIdx.x;
|
@@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
|
501
582
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
502
583
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
503
584
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
585
|
+
#else
|
586
|
+
|
587
|
+
// assume 32 threads
|
588
|
+
const int tid = threadIdx.x;
|
589
|
+
const int ip = tid/16; // 0 or 1
|
590
|
+
const int il = tid - 16*ip; // 0...15
|
591
|
+
|
592
|
+
float * y = yy + i*QK_K + 16*ip + il;
|
593
|
+
|
594
|
+
const float d = x[i].d;
|
595
|
+
|
596
|
+
const uint8_t ql = x[i].ql[16*ip + il];
|
597
|
+
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
598
|
+
const int8_t * sc = x[i].scales;
|
599
|
+
|
600
|
+
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
601
|
+
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
602
|
+
#endif
|
504
603
|
}
|
505
604
|
|
506
605
|
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
@@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
515
614
|
|
516
615
|
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
517
616
|
|
617
|
+
float tmp = 0; // partial sum for thread in warp
|
618
|
+
|
619
|
+
#if QK_K == 256
|
518
620
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
519
621
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
520
622
|
|
@@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
528
630
|
const int s_offset = 8*im;
|
529
631
|
const int y_offset = 128*im + l0;
|
530
632
|
|
531
|
-
float tmp = 0; // partial sum for thread in warp
|
532
|
-
|
533
633
|
uint32_t aux[4];
|
534
634
|
const uint8_t * d = (const uint8_t *)aux;
|
535
635
|
const uint8_t * m = (const uint8_t *)(aux + 2);
|
@@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
565
665
|
tmp += dall * sum1 - dmin * sum2;
|
566
666
|
|
567
667
|
}
|
668
|
+
#else
|
669
|
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
670
|
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
671
|
+
const int offset = tid * K_QUANTS_PER_ITERATION;
|
672
|
+
|
673
|
+
uint32_t uaux[2];
|
674
|
+
const uint8_t * d = (const uint8_t *)uaux;
|
675
|
+
|
676
|
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
677
|
+
|
678
|
+
const float * y = yy + i * QK_K + offset;
|
679
|
+
const uint8_t * q = x[i].qs + offset;
|
680
|
+
const uint32_t * s = (const uint32_t *)x[i].scales;
|
681
|
+
|
682
|
+
uaux[0] = s[0] & 0x0f0f0f0f;
|
683
|
+
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
684
|
+
|
685
|
+
const half2 * dh = (const half2 *)&x[i].d;
|
686
|
+
|
687
|
+
const float2 dall = __half22float2(dh[0]);
|
688
|
+
|
689
|
+
float sum1 = 0, sum2 = 0;
|
690
|
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
691
|
+
const uint8_t ql = q[l];
|
692
|
+
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
693
|
+
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
694
|
+
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
695
|
+
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
696
|
+
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
697
|
+
}
|
698
|
+
tmp += dall.x * sum1 - dall.y * sum2;
|
699
|
+
}
|
700
|
+
#endif
|
568
701
|
|
569
702
|
// sum up partial sums and write back result
|
570
703
|
__syncthreads();
|
@@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
573
706
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
574
707
|
}
|
575
708
|
|
576
|
-
if (
|
709
|
+
if (threadIdx.x == 0) {
|
577
710
|
dst[row] = tmp;
|
578
711
|
}
|
579
712
|
}
|
580
713
|
|
581
714
|
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
582
715
|
|
583
|
-
const uint16_t kmask1 = 0x0303;
|
584
|
-
const uint16_t kmask2 = 0x0f0f;
|
585
|
-
|
586
716
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
587
717
|
if (row > nrows) return;
|
588
718
|
|
@@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
591
721
|
|
592
722
|
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
593
723
|
|
724
|
+
float tmp = 0; // partial sum for thread in warp
|
725
|
+
|
726
|
+
#if QK_K == 256
|
727
|
+
|
728
|
+
const uint16_t kmask1 = 0x0303;
|
729
|
+
const uint16_t kmask2 = 0x0f0f;
|
730
|
+
|
594
731
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
595
732
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
596
733
|
|
@@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
610
747
|
|
611
748
|
const uint16_t s_shift = 4*im;
|
612
749
|
|
613
|
-
float tmp = 0; // partial sum for thread in warp
|
614
|
-
|
615
750
|
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
616
751
|
|
617
752
|
const float * y = yy + i * QK_K + y_offset;
|
@@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
640
775
|
tmp += d * sum;
|
641
776
|
|
642
777
|
}
|
778
|
+
#else
|
779
|
+
|
780
|
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
781
|
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
782
|
+
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
783
|
+
const int in = offset/8; // 0 or 1
|
784
|
+
const int im = offset%8; // 0...7
|
785
|
+
|
786
|
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
787
|
+
|
788
|
+
const float * y = yy + i * QK_K + offset;
|
789
|
+
const uint8_t * q = x[i].qs + offset;
|
790
|
+
const uint8_t * s = x[i].scales;
|
791
|
+
|
792
|
+
const float dall = (float)x[i].d;
|
793
|
+
|
794
|
+
float sum = 0;
|
795
|
+
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
796
|
+
const uint8_t hl = x[i].hmask[im+l] >> in;
|
797
|
+
const uint8_t ql = q[l];
|
798
|
+
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
799
|
+
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
800
|
+
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
801
|
+
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
802
|
+
}
|
803
|
+
tmp += sum;
|
804
|
+
}
|
805
|
+
#endif
|
643
806
|
|
644
807
|
// sum up partial sums and write back result
|
645
808
|
__syncthreads();
|
@@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
648
811
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
649
812
|
}
|
650
813
|
|
651
|
-
if (
|
814
|
+
if (threadIdx.x == 0) {
|
652
815
|
dst[row] = tmp;
|
653
816
|
}
|
654
817
|
}
|
655
818
|
|
656
819
|
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
657
820
|
|
658
|
-
const uint16_t kmask1 = 0x3f3f;
|
659
|
-
const uint16_t kmask2 = 0x0f0f;
|
660
|
-
const uint16_t kmask3 = 0xc0c0;
|
661
|
-
|
662
821
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
663
822
|
if (row > nrows) return;
|
664
823
|
const int num_blocks_per_row = ncols / QK_K;
|
665
824
|
const int ib0 = row*num_blocks_per_row;
|
666
825
|
|
826
|
+
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
827
|
+
|
828
|
+
#if QK_K == 256
|
829
|
+
const uint16_t kmask1 = 0x3f3f;
|
830
|
+
const uint16_t kmask2 = 0x0f0f;
|
831
|
+
const uint16_t kmask3 = 0xc0c0;
|
832
|
+
|
667
833
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
668
834
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
669
835
|
|
@@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
|
683
849
|
uint16_t aux[4];
|
684
850
|
const uint8_t * sc = (const uint8_t *)aux;
|
685
851
|
|
686
|
-
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
687
|
-
|
688
852
|
float tmp = 0; // partial sum for thread in warp
|
689
853
|
|
690
854
|
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
@@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
|
713
877
|
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
714
878
|
|
715
879
|
}
|
880
|
+
#else
|
881
|
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
882
|
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
883
|
+
|
884
|
+
const int step = tid * K_QUANTS_PER_ITERATION;
|
885
|
+
|
886
|
+
uint16_t aux16[2];
|
887
|
+
const uint8_t * s = (const uint8_t *)aux16;
|
888
|
+
|
889
|
+
float tmp = 0;
|
890
|
+
|
891
|
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
892
|
+
const uint8_t * q = x[i].qs + step;
|
893
|
+
const float * y = yy + i*QK_K + step;
|
894
|
+
const uint16_t * a = (const uint16_t *)x[i].scales;
|
895
|
+
aux16[0] = a[0] & 0x0f0f;
|
896
|
+
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
897
|
+
const float d = (float)x[i].d[0];
|
898
|
+
const float m = (float)x[i].d[1];
|
899
|
+
float sum = 0.f;
|
900
|
+
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
901
|
+
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
902
|
+
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
903
|
+
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
904
|
+
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
905
|
+
}
|
906
|
+
tmp += sum;
|
907
|
+
}
|
908
|
+
|
909
|
+
#endif
|
716
910
|
|
717
911
|
// sum up partial sums and write back result
|
718
912
|
__syncthreads();
|
@@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
|
728
922
|
|
729
923
|
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
730
924
|
|
731
|
-
const uint16_t kmask1 = 0x3f3f;
|
732
|
-
const uint16_t kmask2 = 0x0f0f;
|
733
|
-
const uint16_t kmask3 = 0xc0c0;
|
734
|
-
|
735
|
-
//const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
736
925
|
const int row = blockIdx.x;
|
737
926
|
const int num_blocks_per_row = ncols / QK_K;
|
738
927
|
const int ib0 = row*num_blocks_per_row;
|
739
928
|
|
929
|
+
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
930
|
+
|
931
|
+
float tmp = 0; // partial sum for thread in warp
|
932
|
+
|
933
|
+
#if QK_K == 256
|
934
|
+
const uint16_t kmask1 = 0x3f3f;
|
935
|
+
const uint16_t kmask2 = 0x0f0f;
|
936
|
+
const uint16_t kmask3 = 0xc0c0;
|
937
|
+
|
740
938
|
const int tid = threadIdx.x/2; // 0...15
|
741
939
|
const int ix = threadIdx.x%2;
|
742
940
|
|
@@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
|
757
955
|
uint16_t aux[4];
|
758
956
|
const uint8_t * sc = (const uint8_t *)aux;
|
759
957
|
|
760
|
-
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
761
|
-
|
762
|
-
float tmp = 0; // partial sum for thread in warp
|
763
|
-
|
764
958
|
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
765
959
|
|
766
960
|
const uint8_t * ql1 = x[i].qs + q_offset;
|
@@ -793,8 +987,31 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
|
793
987
|
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
794
988
|
}
|
795
989
|
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
990
|
+
}
|
796
991
|
|
992
|
+
#else
|
993
|
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
994
|
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
995
|
+
const int step = tid * K_QUANTS_PER_ITERATION;
|
996
|
+
const int im = step/8;
|
997
|
+
const int in = step%8;
|
998
|
+
|
999
|
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
1000
|
+
const uint8_t * q = x[i].qs + step;
|
1001
|
+
const int8_t * s = x[i].scales;
|
1002
|
+
const float * y = yy + i*QK_K + step;
|
1003
|
+
const float d = x[i].d;
|
1004
|
+
float sum = 0.f;
|
1005
|
+
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
1006
|
+
const uint8_t h = x[i].qh[in+j] >> im;
|
1007
|
+
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
1008
|
+
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
1009
|
+
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
1010
|
+
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
1011
|
+
}
|
1012
|
+
tmp += sum;
|
797
1013
|
}
|
1014
|
+
#endif
|
798
1015
|
|
799
1016
|
// sum up partial sums and write back result
|
800
1017
|
__syncthreads();
|
@@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
|
803
1020
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
804
1021
|
}
|
805
1022
|
|
806
|
-
if (
|
1023
|
+
if (threadIdx.x == 0) {
|
807
1024
|
dst[row] = tmp;
|
808
1025
|
}
|
809
1026
|
}
|
@@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
|
820
1037
|
|
821
1038
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
822
1039
|
|
1040
|
+
#if QK_K == 256
|
1041
|
+
|
823
1042
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
824
1043
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
825
1044
|
|
@@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
|
874
1093
|
|
875
1094
|
}
|
876
1095
|
|
1096
|
+
#else
|
1097
|
+
|
1098
|
+
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
|
1099
|
+
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
|
1100
|
+
|
1101
|
+
const int step = tid * K_QUANTS_PER_ITERATION;
|
1102
|
+
|
1103
|
+
float tmp = 0; // partial sum for thread in warp
|
1104
|
+
|
1105
|
+
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
1106
|
+
|
1107
|
+
const float * y = yy + i * QK_K + step;
|
1108
|
+
const uint8_t * ql = x[i].ql + step;
|
1109
|
+
const uint8_t * qh = x[i].qh + step;
|
1110
|
+
const int8_t * s = x[i].scales;
|
1111
|
+
|
1112
|
+
const float d = x[i+0].d;
|
1113
|
+
|
1114
|
+
float sum = 0;
|
1115
|
+
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
1116
|
+
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
1117
|
+
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
1118
|
+
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
1119
|
+
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
1120
|
+
}
|
1121
|
+
tmp += sum;
|
1122
|
+
|
1123
|
+
}
|
1124
|
+
|
1125
|
+
#endif
|
1126
|
+
|
877
1127
|
// sum up partial sums and write back result
|
878
1128
|
__syncthreads();
|
879
1129
|
#pragma unroll
|
@@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
|
|
1252
1502
|
|
1253
1503
|
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
1254
1504
|
const int nb = k / QK_K;
|
1505
|
+
#if QK_K == 256
|
1255
1506
|
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
1507
|
+
#else
|
1508
|
+
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
1509
|
+
#endif
|
1256
1510
|
}
|
1257
1511
|
|
1258
1512
|
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
1259
1513
|
const int nb = k / QK_K;
|
1514
|
+
#if QK_K == 256
|
1260
1515
|
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
1516
|
+
#else
|
1517
|
+
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
1518
|
+
#endif
|
1261
1519
|
}
|
1262
1520
|
|
1263
1521
|
static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
@@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu
|
|
1267
1525
|
|
1268
1526
|
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
1269
1527
|
const int nb = k / QK_K;
|
1528
|
+
#if QK_K == 256
|
1270
1529
|
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
1530
|
+
#else
|
1531
|
+
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
1532
|
+
#endif
|
1271
1533
|
}
|
1272
1534
|
|
1273
1535
|
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
1274
1536
|
const int nb = k / QK_K;
|
1537
|
+
#if QK_K == 256
|
1275
1538
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
1539
|
+
#else
|
1540
|
+
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
1541
|
+
#endif
|
1276
1542
|
}
|
1277
1543
|
|
1278
1544
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
@@ -2553,6 +2819,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
|
|
2553
2819
|
|
2554
2820
|
tensor->backend = GGML_BACKEND_GPU;
|
2555
2821
|
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
2822
|
+
memset(extra, 0, sizeof(*extra));
|
2556
2823
|
|
2557
2824
|
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
|
2558
2825
|
tensor->op == GGML_OP_VIEW;
|
@@ -2635,7 +2902,7 @@ void ggml_cuda_free_scratch() {
|
|
2635
2902
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
2636
2903
|
ggml_cuda_func_t func;
|
2637
2904
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
2638
|
-
|| tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
|
2905
|
+
|| (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
|
2639
2906
|
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
|
2640
2907
|
|
2641
2908
|
switch (tensor->op) {
|