llama_cpp 0.2.2 → 0.3.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

  //================================= k-quants

+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #define K_SCALE_SIZE 4
+ #else
  #define QK_K 256
+ #define K_SCALE_SIZE 12
+ #endif

  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -128,13 +134,25 @@ typedef struct {
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

  typedef struct {
- uint8_t hmask[QK_K/8];
- uint8_t qs[QK_K/4]; // nibbles / quants
- uint8_t scales[3*QK_K/64];
- half d;
+ uint8_t hmask[QK_K/8]; // quants - high bit
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
+ #ifdef GGML_QKK_64
+ uint8_t scales[2]; // scales, quantized with 8 bits
+ #else
+ uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+ #endif
+ half d; // super-block scale
  } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+ //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+ #ifdef GGML_QKK_64
+ typedef struct {
+ half d[2]; // super-block scales/mins
+ uint8_t scales[2]; // 4-bit block scales/mins
+ uint8_t qs[QK_K/2]; // 4--bit quants
+ } block_q4_K;
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ #else
  typedef struct {
  half d; // super-block scale for quantized scales
  half dmin; // super-block scale for quantized mins
@@ -142,15 +160,26 @@ typedef struct {
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+ #endif

+ #ifdef GGML_QKK_64
+ typedef struct {
+ half d; // super-block scale
+ int8_t scales[QK_K/16]; // block scales
+ uint8_t qh[QK_K/8]; // quants, high bit
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
+ } block_q5_K;
+ static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+ #else
  typedef struct {
- half d; // super-block scale for quantized scales
- half dmin; // super-block scale for quantized mins
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+ half d; // super-block scale for quantized scales
+ half dmin; // super-block scale for quantized mins
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
  uint8_t qh[QK_K/8]; // quants, high bit
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+ #endif

  typedef struct {
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
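The GGML_QKK_64 build flag introduced above shrinks the k-quant super-block from 256 to 64 weights, and the static_asserts pin down the resulting struct sizes. As a quick illustration (not part of the package), the following host-side sketch reproduces the block_q4_K size arithmetic for both configurations, assuming ggml_fp16_t/half is 2 bytes:

```cpp
#include <cstdio>
#include <cstddef>

// Expected sizeof(block_q4_K) implied by the structs in this diff.
// QK_K == 64  (GGML_QKK_64): half d[2] + uint8_t scales[2] + uint8_t qs[QK_K/2]
// QK_K == 256 (default):     half d, dmin + uint8_t scales[3*QK_K/64] + uint8_t qs[QK_K/2]
static size_t q4_K_bytes(int qk_k) {
    const size_t fp16 = 2; // sizeof(ggml_fp16_t), assumed to be 2 bytes
    return qk_k == 64 ? 2*fp16 + 2 + qk_k/2
                      : 2*fp16 + 3*qk_k/64 + qk_k/2;
}

int main() {
    std::printf("block_q4_K: QK_K=64 -> %zu bytes, QK_K=256 -> %zu bytes\n",
                q4_K_bytes(64), q4_K_bytes(256)); // prints 38 and 144
    return 0;
}
```

These values match what the static_asserts above enforce at compile time.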
@@ -194,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
  dst[i] = x[i] + y[i];
  }

+ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = __hadd(x[i], __float2half(y[i]));
+ }
+
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -349,13 +387,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
  static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

  const int i = blockIdx.x;
+ const block_q2_K * x = (const block_q2_K *) vx;
+
  const int tid = threadIdx.x;
+ #if QK_K == 256
  const int n = tid/32;
  const int l = tid - 32*n;
  const int is = 8*n + l/16;

- const block_q2_K * x = (const block_q2_K *) vx;
-
  const uint8_t q = x[i].qs[32*n + l];
  float * y = yy + i*QK_K + 128*n;

@@ -365,21 +404,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+ #else
+ const int is = tid/16; // 0 or 1
+ const int il = tid%16; // 0...15
+ const uint8_t q = x[i].qs[il] >> (2*is);
+ float * y = yy + i*QK_K + 16*is + il;
+ float dall = x[i].d;
+ float dmin = x[i].dmin;
+ y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+ y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+ #endif

  }

  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

- int r = threadIdx.x/4;
- int i = blockIdx.x;
- int tid = r/2;
- int is0 = r%2;
- int l0 = 16*is0 + 4*(threadIdx.x%4);
- int n = tid / 4;
- int j = tid - 4*n;
-
+ const int i = blockIdx.x;
  const block_q3_K * x = (const block_q3_K *) vx;

+ #if QK_K == 256
+ const int r = threadIdx.x/4;
+ const int tid = r/2;
+ const int is0 = r%2;
+ const int l0 = 16*is0 + 4*(threadIdx.x%4);
+ const int n = tid / 4;
+ const int j = tid - 4*n;
+
  uint8_t m = 1 << (4*n + j);
  int is = 8*n + 2*j + is0;
  int shift = 2*j;
@@ -396,9 +446,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
  const uint8_t * hm = x[i].hmask;

  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+ #else
+ const int tid = threadIdx.x;
+ const int is = tid/16; // 0 or 1
+ const int il = tid%16; // 0...15
+ const int im = il/8; // 0...1
+ const int in = il%8; // 0...7
+
+ float * y = yy + i*QK_K + 16*is + il;
+
+ const uint8_t q = x[i].qs[il] >> (2*is);
+ const uint8_t h = x[i].hmask[in] >> (2*is + im);
+ const float d = (float)x[i].d;
+
+ if (is == 0) {
+ y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+ y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+ } else {
+ y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+ y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+ }
+ #endif

  }

+ #if QK_K == 256
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  if (j < 4) {
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -407,19 +479,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
  m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  }
  }
+ #endif

  static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
  const block_q4_K * x = (const block_q4_K *) vx;

  const int i = blockIdx.x;

- //// assume 64 threads - this is very slightly better than the one below
- //const int tid = threadIdx.x;
- //const int il = tid/16;
- //const int ir = tid%16;
- //const int is = 2*il;
- //const int n = 2;
-
+ #if QK_K == 256
  // assume 32 threads
  const int tid = threadIdx.x;
  const int il = tid/8;
@@ -443,6 +510,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
  y[l +32] = d2 * (q[l] >> 4) - m2;
  }
+ #else
+ const int tid = threadIdx.x;
+ const uint8_t * q = x[i].qs;
+ float * y = yy + i*QK_K;
+ const float d = (float)x[i].d[0];
+ const float m = (float)x[i].d[1];
+ y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+ y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
+ #endif
  }

  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
@@ -450,6 +526,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {

  const int i = blockIdx.x;

+ #if QK_K == 256
  // assume 64 threads - this is very slightly better than the one below
  const int tid = threadIdx.x;
  const int il = tid/16; // il is in 0...3
@@ -476,12 +553,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
  hm <<= 1;
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+ #else
+ const int tid = threadIdx.x;
+ const uint8_t q = x[i].qs[tid];
+ const int im = tid/8; // 0...3
+ const int in = tid%8; // 0...7
+ const int is = tid/16; // 0 or 1
+ const uint8_t h = x[i].qh[in] >> im;
+ const float d = x[i].d;
+ float * y = yy + i*QK_K + tid;
+ y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+ y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
+ #endif
  }

  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
  const block_q6_K * x = (const block_q6_K *) vx;

  const int i = blockIdx.x;
+ #if QK_K == 256

  // assume 64 threads - this is very slightly better than the one below
  const int tid = threadIdx.x;
@@ -501,6 +591,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+ #else
+
+ // assume 32 threads
+ const int tid = threadIdx.x;
+ const int ip = tid/16; // 0 or 1
+ const int il = tid - 16*ip; // 0...15
+
+ float * y = yy + i*QK_K + 16*ip + il;
+
+ const float d = x[i].d;
+
+ const uint8_t ql = x[i].ql[16*ip + il];
+ const uint8_t qh = x[i].qh[il] >> (2*ip);
+ const int8_t * sc = x[i].scales;
+
+ y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+ y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+ #endif
  }

  static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
@@ -515,6 +623,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float

  const block_q2_K * x = (const block_q2_K *)vx + ib0;

+ float tmp = 0; // partial sum for thread in warp
+
+ #if QK_K == 256
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1

@@ -528,8 +639,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
  const int s_offset = 8*im;
  const int y_offset = 128*im + l0;

- float tmp = 0; // partial sum for thread in warp
-
  uint32_t aux[4];
  const uint8_t * d = (const uint8_t *)aux;
  const uint8_t * m = (const uint8_t *)(aux + 2);
@@ -565,6 +674,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
  tmp += dall * sum1 - dmin * sum2;

  }
+ #else
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
+ const int offset = tid * K_QUANTS_PER_ITERATION;
+
+ uint32_t uaux[2];
+ const uint8_t * d = (const uint8_t *)uaux;
+
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + offset;
+ const uint8_t * q = x[i].qs + offset;
+ const uint32_t * s = (const uint32_t *)x[i].scales;
+
+ uaux[0] = s[0] & 0x0f0f0f0f;
+ uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+ const half2 * dh = (const half2 *)&x[i].d;
+
+ const float2 dall = __half22float2(dh[0]);
+
+ float sum1 = 0, sum2 = 0;
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+ const uint8_t ql = q[l];
+ sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+ + y[l+16] * d[1] * ((ql >> 2) & 3)
+ + y[l+32] * d[2] * ((ql >> 4) & 3)
+ + y[l+48] * d[3] * ((ql >> 6) & 3);
+ sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+ }
+ tmp += dall.x * sum1 - dall.y * sum2;
+ }
+ #endif

  // sum up partial sums and write back result
  __syncthreads();
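In the QK_K == 64 branch above, the four packed q2_K scale bytes are read as one uint32_t and split into eight separate bytes with two masked copies: the low nibbles (scales) land in d[0..3] and the high nibbles (mins) in d[4..7]. A standalone host-side sketch of that unpacking trick (illustration only, not package code; it assumes little-endian byte order, matching the kernel's uint32_t view of x[i].scales):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Each byte packs a 4-bit scale (low nibble) and a 4-bit min (high nibble).
    const uint8_t scales[4] = { 0xA1, 0xB2, 0xC3, 0xD4 };

    uint32_t s;
    std::memcpy(&s, scales, sizeof s);

    uint32_t uaux[2];
    uaux[0] = s & 0x0f0f0f0f;        // d[0..3] = the four scales: 1, 2, 3, 4
    uaux[1] = (s >> 4) & 0x0f0f0f0f; // d[4..7] = the four mins:  10, 11, 12, 13
    const uint8_t * d = (const uint8_t *)uaux;

    for (int j = 0; j < 8; ++j) {
        std::printf("d[%d] = %u\n", j, d[j]);
    }
    return 0;
}
```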
@@ -573,16 +715,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- if (tid == 0) {
+ if (threadIdx.x == 0) {
  dst[row] = tmp;
  }
  }

  static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

- const uint16_t kmask1 = 0x0303;
- const uint16_t kmask2 = 0x0f0f;
-
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
  if (row > nrows) return;

@@ -591,6 +730,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float

  const block_q3_K * x = (const block_q3_K *)vx + ib0;

+ float tmp = 0; // partial sum for thread in warp
+
+ #if QK_K == 256
+
+ const uint16_t kmask1 = 0x0303;
+ const uint16_t kmask2 = 0x0f0f;
+
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1

@@ -610,8 +756,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float

  const uint16_t s_shift = 4*im;

- float tmp = 0; // partial sum for thread in warp
-
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

  const float * y = yy + i * QK_K + y_offset;
@@ -640,6 +784,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
  tmp += d * sum;

  }
+ #else
+
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
+ const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
+ const int in = offset/8; // 0 or 1
+ const int im = offset%8; // 0...7
+
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + offset;
+ const uint8_t * q = x[i].qs + offset;
+ const uint8_t * s = x[i].scales;
+
+ const float dall = (float)x[i].d;
+
+ float sum = 0;
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+ const uint8_t hl = x[i].hmask[im+l] >> in;
+ const uint8_t ql = q[l];
+ sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+ + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+ + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+ + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+ }
+ tmp += sum;
+ }
+ #endif

  // sum up partial sums and write back result
  __syncthreads();
@@ -648,22 +820,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- if (tid == 0) {
+ if (threadIdx.x == 0) {
  dst[row] = tmp;
  }
  }

  static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

- const uint16_t kmask1 = 0x3f3f;
- const uint16_t kmask2 = 0x0f0f;
- const uint16_t kmask3 = 0xc0c0;
-
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
  if (row > nrows) return;
  const int num_blocks_per_row = ncols / QK_K;
  const int ib0 = row*num_blocks_per_row;

+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+ #if QK_K == 256
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1

@@ -683,8 +858,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

- const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
  float tmp = 0; // partial sum for thread in warp

  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
@@ -713,6 +886,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
  tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;

  }
+ #else
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+ const int step = tid * K_QUANTS_PER_ITERATION;
+
+ uint16_t aux16[2];
+ const uint8_t * s = (const uint8_t *)aux16;
+
+ float tmp = 0;
+
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+ const uint8_t * q = x[i].qs + step;
+ const float * y = yy + i*QK_K + step;
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+ const float d = (float)x[i].d[0];
+ const float m = (float)x[i].d[1];
+ float sum = 0.f;
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+ sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+ + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+ + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
+ + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
+ }
+ tmp += sum;
+ }
+
+ #endif

  // sum up partial sums and write back result
  __syncthreads();
@@ -728,15 +931,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float

  static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {

- const uint16_t kmask1 = 0x3f3f;
- const uint16_t kmask2 = 0x0f0f;
- const uint16_t kmask3 = 0xc0c0;
-
- //const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int row = blockIdx.x;
  const int num_blocks_per_row = ncols / QK_K;
  const int ib0 = row*num_blocks_per_row;

+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ #if QK_K == 256
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
  const int tid = threadIdx.x/2; // 0...15
  const int ix = threadIdx.x%2;

@@ -757,10 +964,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

- const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
- float tmp = 0; // partial sum for thread in warp
-
  for (int i = ix; i < num_blocks_per_row; i += 2) {

  const uint8_t * ql1 = x[i].qs + q_offset;
@@ -793,8 +996,31 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
  }
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+ }

+ #else
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+ const int step = tid * K_QUANTS_PER_ITERATION;
+ const int im = step/8;
+ const int in = step%8;
+
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+ const uint8_t * q = x[i].qs + step;
+ const int8_t * s = x[i].scales;
+ const float * y = yy + i*QK_K + step;
+ const float d = x[i].d;
+ float sum = 0.f;
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+ const uint8_t h = x[i].qh[in+j] >> im;
+ sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+ + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+ + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
+ + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
+ }
+ tmp += sum;
  }
+ #endif

  // sum up partial sums and write back result
  __syncthreads();
@@ -803,7 +1029,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- if (tid == 0) {
+ if (threadIdx.x == 0) {
  dst[row] = tmp;
  }
  }
@@ -820,6 +1046,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float

  const block_q6_K * x = (const block_q6_K *)vx + ib0;

+ #if QK_K == 256
+
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1

@@ -874,6 +1102,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float

  }

+ #else
+
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
+
+ const int step = tid * K_QUANTS_PER_ITERATION;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + step;
+ const uint8_t * ql = x[i].ql + step;
+ const uint8_t * qh = x[i].qh + step;
+ const int8_t * s = x[i].scales;
+
+ const float d = x[i+0].d;
+
+ float sum = 0;
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+ sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+ + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+ + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
+ + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
+ }
+ tmp += sum;
+
+ }
+
+ #endif
+
  // sum up partial sums and write back result
  __syncthreads();
  #pragma unroll
@@ -985,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
  }

  static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
- const half * x = (half *) vx;
+ const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1033,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl

  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
- const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+ const int row_stride_x, const int channel_stride_x) {

- const half * x = (half *) vx;
+ const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1078,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  }

  static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
- const float * xi = (float *) cxi;
+ const float * xi = (const float *) cxi;
  float * dsti = (float *) cdsti;

  *dsti = *xi;
  }

  static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
- const float * xi = (float *) cxi;
+ const float * xi = (const float *) cxi;
  half * dsti = (half *) cdsti;

  *dsti = __float2half(*xi);
@@ -1209,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
  }

+ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ }
+
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
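The new add_f16_f32_f16 kernel and its add_f16_f32_f16_cuda launcher use the same ceil-division grid sizing as the existing add_f32_cuda. A self-contained usage sketch is below (illustration only; the 256-thread block size, the standalone main, and the locally redefined kernel are assumptions for the sketch, not package code). The kernel body mirrors the one added earlier in this diff and needs an architecture with fp16 arithmetic (sm_53 or newer).

```cuda
#include <cuda_fp16.h>
#include <cstdio>
#include <vector>

#define CUDA_ADD_BLOCK_SIZE 256 // assumed here; the real constant comes from ggml-cuda.cu

// Local copy of the kernel from the diff, so the sketch compiles on its own.
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = __hadd(x[i], __float2half(y[i]));
}

int main() {
    const int k = 1000; // deliberately not a multiple of the block size
    std::vector<half>  hx(k, __float2half(1.0f));
    std::vector<float> hy(k, 2.0f);

    half *dx, *ddst; float *dy;
    cudaMalloc(&dx,   k*sizeof(half));
    cudaMalloc(&dy,   k*sizeof(float));
    cudaMalloc(&ddst, k*sizeof(half));
    cudaMemcpy(dx, hx.data(), k*sizeof(half),  cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    // Same grid computation as the *_cuda launchers: ceil(k / block size).
    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE>>>(dx, dy, ddst, k);

    std::vector<half> hdst(k);
    cudaMemcpy(hdst.data(), ddst, k*sizeof(half), cudaMemcpyDeviceToHost);
    std::printf("dst[0] = %.1f (expected 3.0)\n", __half2float(hdst[0]));

    cudaFree(dx); cudaFree(dy); cudaFree(ddst);
    return 0;
}
```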
@@ -1252,12 +1516,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu

  static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
+ #if QK_K == 256
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+ #else
+ dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+ #endif
  }

  static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
+ #if QK_K == 256
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+ #else
+ dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+ #endif
  }

  static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1267,12 +1539,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu

  static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
+ #if QK_K == 256
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+ #else
+ dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+ #endif
  }

  static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
+ #if QK_K == 256
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+ #else
+ dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+ #endif
  }

  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -1418,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
  const dim3 block_nums(1, nrows_x, nchannels_x);
  const dim3 block_dims(WARP_SIZE, 1, 1);
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
  }

  static void ggml_cpy_f32_f32_cuda(
@@ -1675,7 +1955,7 @@ inline void ggml_cuda_op_add(
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
  cudaStream_t & cudaStream_main){

- GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_ddf_i != nullptr);

@@ -1683,7 +1963,13 @@ inline void ggml_cuda_op_add(
  const int64_t i01_diff = i01_high - i01_low;

  // compute
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else {
+ GGML_ASSERT(false);
+ }
  CUDA_CHECK(cudaGetLastError());

  (void) src1;
@@ -2281,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }

  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+ // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+ // Due to flatten_rows == true this does in practice not make a difference however.
+ // Better solution would be nice but right now that would require disproportionate changes.
+ GGML_ASSERT(
+ (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+ src1->type == GGML_TYPE_F32 &&
+ (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
  }

  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2535,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
  delete extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
  if (scratch && g_scratch_size == 0) {
  return;
  }
@@ -2544,22 +2836,24 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
  if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src0->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
  }
  }
  if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
  }

  tensor->backend = GGML_BACKEND_GPU;
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+ memset(extra, 0, sizeof(*extra));

  const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW;
+ tensor->op == GGML_OP_VIEW ||
+ force_inplace;
  const size_t size = ggml_nbytes(tensor);

  CUDA_CHECK(cudaSetDevice(g_main_device));
- if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+ if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
@@ -2598,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
  }

  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true);
+ ggml_cuda_assign_buffers_impl(tensor, true, false);
  }

  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false);
+ ggml_cuda_assign_buffers_impl(tensor, false, false);
+ }
+
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+ ggml_cuda_assign_buffers_impl(tensor, false, true);
  }

  void ggml_cuda_set_main_device(int main_device) {
@@ -2635,7 +2933,7 @@ void ggml_cuda_free_scratch() {
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
  ggml_cuda_func_t func;
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+ || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
  || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);

  switch (tensor->op) {