llama_cpp 0.3.4 → 0.3.6

This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
52
  } while (0)
53
53
  #endif // CUDART_VERSION >= 11
54
54
 
55
- #ifdef GGML_CUDA_DMMV_F16
55
+ #ifdef GGML_CUDA_F16
56
56
  typedef half dfloat; // dequantize float
57
57
  typedef half2 dfloat2;
58
58
  #else
59
59
  typedef float dfloat; // dequantize float
60
60
  typedef float2 dfloat2;
61
- #endif //GGML_CUDA_DMMV_F16
61
+ #endif //GGML_CUDA_F16
62
+
63
+ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
64
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
65
+
66
+ int x32 = 0;
67
+ x32 |= x16[0] << 0;
68
+ x32 |= x16[1] << 16;
69
+
70
+ return x32;
71
+ }
72
+
73
+ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
74
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
75
+
76
+ int x32 = 0;
77
+ x32 |= x16[0] << 0;
78
+ x32 |= x16[1] << 16;
79
+
80
+ return x32;
81
+ }
82
+
83
+ static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
84
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
85
+ }
86
+
87
+ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
88
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
89
+ }
62
90
 
63
91
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
64
92
  typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
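
This hunk adds helpers that assemble a 32-bit word from four consecutive quant bytes. The plain variants go through two 16-bit reads because the qs arrays of most blocks sit behind a single half and are therefore only guaranteed 2-byte alignment, while the *_aligned variants use one 32-bit load for blocks whose data follows a half2 (such as block_q8_1 below). A host-side sketch of the unaligned case, with an illustrative name and memcpy standing in for the device-side uint16_t casts:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Illustrative: rebuild a 32-bit word from bytes with only 2-byte alignment,
    // mirroring get_int_from_uint8 (which casts to uint16_t * on the device).
    static int int_from_2byte_aligned(const uint8_t * x8, int i32) {
        uint16_t lo, hi;
        std::memcpy(&lo, x8 + sizeof(int) * i32 + 0, sizeof(lo)); // low  16 bits
        std::memcpy(&hi, x8 + sizeof(int) * i32 + 2, sizeof(hi)); // high 16 bits
        const uint32_t x32 = (uint32_t) lo | ((uint32_t) hi << 16);
        return (int) x32;
    }

    int main() {
        const uint8_t qs[8] = {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0};
        printf("%08x\n", int_from_2byte_aligned(qs, 1)); // f0debc9a on little-endian
        return 0;
    }
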
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
87
115
  #define QR4_1 2
88
116
  #define QI4_1 (QK4_1 / (4 * QR4_1))
89
117
  typedef struct {
90
- half d; // delta
91
- half m; // min
118
+ half2 dm; // dm.x = delta, dm.y = min
92
119
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
93
120
  } block_q4_1;
94
121
  static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
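
block_q4_1 now packs its delta and min into a single half2 instead of two separate half fields; block_q5_1, block_q8_1 and several k-quant blocks below get the same treatment. The size and byte layout are unchanged (the static_assert still expects 2*sizeof(ggml_fp16_t)), but both values can be fetched with one 32-bit load and, under GGML_CUDA_F16, processed with paired intrinsics such as __hmul2/__hadd2. A hedged sketch of the float path against this layout (illustrative helper, not part of the package):

    // Dequantize one packed q4_1 byte into two floats, reading delta/min from
    // the new half2 field with a single conversion. Illustrative only.
    static __device__ __forceinline__ float2 dequantize_q4_1_pair(const block_q4_1 * b, const int iqs) {
        const float2 dm = __half22float2(b->dm); // dm.x = delta, dm.y = min
        const int   vui = b->qs[iqs];
        float2 v;
        v.x = (vui & 0xF) * dm.x + dm.y; // low nibble
        v.y = (vui >>  4) * dm.x + dm.y; // high nibble
        return v;
    }
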
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
107
134
  #define QR5_1 2
108
135
  #define QI5_1 (QK5_1 / (4 * QR5_1))
109
136
  typedef struct {
110
- half d; // delta
111
- half m; // min
137
+ half2 dm; // dm.x = delta, dm.y = min
112
138
  uint8_t qh[4]; // 5-th bit of quants
113
139
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
114
140
  } block_q5_1;
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
127
153
  #define QR8_1 1
128
154
  #define QI8_1 (QK8_1 / (4 * QR8_1))
129
155
  typedef struct {
130
- half d; // delta
131
- half s; // unquantized sum
156
+ half2 ds; // ds.x = delta, ds.y = sum
132
157
  int8_t qs[QK8_0]; // quants
133
158
  } block_q8_1;
134
159
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
135
160
 
136
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
161
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
162
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
163
+ typedef void (*load_tiles_cuda_t)(
164
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
165
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
166
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
167
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
168
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
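
These three function-pointer types describe the tile-based mul_mat_q path introduced in this release: allocate_tiles_cuda_t hands out per-block shared memory for a tile of quantized x data, load_tiles_cuda_t fills that tile from global memory, and vec_dot_q_mul_mat_cuda_t combines it with a q8_1 tile of y. Each quantization type supplies one implementation of each (see allocate_tiles_q4_0, load_tiles_q4_0 and vec_dot_q4_0_q8_1_mul_mat further down). The sketch below only illustrates how such callbacks compose as template parameters; the thread mapping, the k loop and the accumulation are simplified assumptions, not the kernel shipped in this package:

    // Simplified composition of the three callbacks; assumes blockDim = (WARP_SIZE, 8)
    // and the surrounding definitions in this file. Illustrative only.
    template <allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles,
              vec_dot_q_mul_mat_cuda_t vec_dot>
    static __device__ float mul_mat_q_tile_sketch(
        const void * __restrict__ x, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds,
        const int i_max, const int blocks_per_row) {

        int * x_ql; half2 * x_dm; int * x_qh; int * x_sc;
        allocate_tiles(&x_ql, &x_dm, &x_qh, &x_sc); // carve out __shared__ tiles

        // each thread loads one strip: i_offset in [0, 8), k in [0, WARP_SIZE)
        load_tiles(x, x_ql, x_dm, x_qh, x_sc, threadIdx.y, i_max, threadIdx.x, blocks_per_row);
        __syncthreads();

        // accumulate dot products of x-tile row i against y-tile column j
        float sum = 0.0f;
        for (int k = 0; k < WARP_SIZE; ++k) { // per-type stride omitted for brevity
            sum += vec_dot(x_ql, x_dm, x_qh, x_sc, y_qs, y_ds, threadIdx.y, threadIdx.x, k);
        }
        return sum;
    }
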
137
169
 
138
170
  //================================= k-quants
139
171
 
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
150
182
  typedef struct {
151
183
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
152
184
  uint8_t qs[QK_K/4]; // quants
153
- half d; // super-block scale for quantized scales
154
- half dmin; // super-block scale for quantized mins
185
+ half2 dm; // super-block scale for quantized scales/mins
155
186
  } block_q2_K;
156
187
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
157
188
 
@@ -180,8 +211,7 @@ typedef struct {
180
211
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
181
212
  #else
182
213
  typedef struct {
183
- half d; // super-block scale for quantized scales
184
- half dmin; // super-block scale for quantized mins
214
+ half2 dm; // super-block scale for quantized scales/mins
185
215
  uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
186
216
  uint8_t qs[QK_K/2]; // 4--bit quants
187
217
  } block_q4_K;
@@ -200,11 +230,10 @@ typedef struct {
200
230
  static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
201
231
  #else
202
232
  typedef struct {
203
- half d; // super-block scale for quantized scales
204
- half dmin; // super-block scale for quantized mins
205
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
206
- uint8_t qh[QK_K/8]; // quants, high bit
207
- uint8_t qs[QK_K/2]; // quants, low 4 bits
233
+ half2 dm; // super-block scale for quantized scales/mins
234
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
235
+ uint8_t qh[QK_K/8]; // quants, high bit
236
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
208
237
  } block_q5_K;
209
238
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
210
239
  #endif
@@ -220,7 +249,7 @@ typedef struct {
220
249
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
221
250
 
222
251
  #define WARP_SIZE 32
223
- #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
252
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
224
253
 
225
254
  #define CUDA_ADD_BLOCK_SIZE 256
226
255
  #define CUDA_MUL_BLOCK_SIZE 256
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
233
262
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
234
263
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
235
264
 
265
+ #ifndef GGML_CUDA_MMQ_Y
266
+ #define GGML_CUDA_MMQ_Y 64
267
+ #endif // GGML_CUDA_MMQ_Y
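
GGML_CUDA_MMQ_Y is the compile-time tile height (rows of quantized x handled per thread block) for the new mul_mat_q kernels; it defaults to 64 and, thanks to the #ifndef guard, can be overridden with a -D compiler define at build time. Together with the MATRIX_ROW_PADDING bump from 256 to 512 in the previous hunk it lets whole tiles be loaded without per-element bounds checks. A sketch of the usual round-up, assuming the padding is applied to the row length of the quantized buffers (helper name illustrative):

    // Round a row length up to the padding multiple; illustrative helper.
    static int padded_row_size(const int ncols) {
        return (ncols + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
    }
    // Shared-memory ints for one q4_0 tile (see allocate_tiles_q4_0 below):
    //   GGML_CUDA_MMQ_Y * WARP_SIZE + GGML_CUDA_MMQ_Y = 64*32 + 64 by default.
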
268
+
236
269
  // dmmv = dequantize_mul_mat_vec
237
270
  #ifndef GGML_CUDA_DMMV_X
238
271
  #define GGML_CUDA_DMMV_X 32
@@ -332,12 +365,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
332
365
  }
333
366
  }
334
367
 
335
- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
368
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
336
369
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
337
370
  const int tid = threadIdx.x;
338
371
 
339
- const float eps = 1e-6f;
340
-
341
372
  float tmp = 0.0f; // partial sum for thread in warp
342
373
 
343
374
  for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -369,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
369
400
  v.x = vui & 0xF;
370
401
  v.y = vui >> 4;
371
402
 
372
- #ifdef GGML_CUDA_DMMV_F16
403
+ #ifdef GGML_CUDA_F16
373
404
  v = __hsub2(v, {8.0f, 8.0f});
374
405
  v = __hmul2(v, {d, d});
375
406
  #else
376
407
  v.x = (v.x - 8.0f) * d;
377
408
  v.y = (v.y - 8.0f) * d;
378
- #endif // GGML_CUDA_DMMV_F16
409
+ #endif // GGML_CUDA_F16
379
410
  }
380
411
 
381
412
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
382
413
  const block_q4_1 * x = (const block_q4_1 *) vx;
383
414
 
384
- const dfloat d = x[ib].d;
385
- const dfloat m = x[ib].m;
415
+ const dfloat d = x[ib].dm.x;
416
+ const dfloat m = x[ib].dm.y;
386
417
 
387
418
  const int vui = x[ib].qs[iqs];
388
419
 
389
420
  v.x = vui & 0xF;
390
421
  v.y = vui >> 4;
391
422
 
392
- #ifdef GGML_CUDA_DMMV_F16
423
+ #ifdef GGML_CUDA_F16
393
424
  v = __hmul2(v, {d, d});
394
425
  v = __hadd2(v, {m, m});
395
426
  #else
396
427
  v.x = (v.x * d) + m;
397
428
  v.y = (v.y * d) + m;
398
- #endif // GGML_CUDA_DMMV_F16
429
+ #endif // GGML_CUDA_F16
399
430
  }
400
431
 
401
432
  static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -412,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
412
443
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
413
444
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
414
445
 
415
- #ifdef GGML_CUDA_DMMV_F16
446
+ #ifdef GGML_CUDA_F16
416
447
  v = __hsub2(v, {16.0f, 16.0f});
417
448
  v = __hmul2(v, {d, d});
418
449
  #else
419
450
  v.x = (v.x - 16.0f) * d;
420
451
  v.y = (v.y - 16.0f) * d;
421
- #endif // GGML_CUDA_DMMV_F16
452
+ #endif // GGML_CUDA_F16
422
453
  }
423
454
 
424
455
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
425
456
  const block_q5_1 * x = (const block_q5_1 *) vx;
426
457
 
427
- const dfloat d = x[ib].d;
428
- const dfloat m = x[ib].m;
458
+ const dfloat d = x[ib].dm.x;
459
+ const dfloat m = x[ib].dm.y;
429
460
 
430
461
  uint32_t qh;
431
462
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -436,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
436
467
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
437
468
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
438
469
 
439
- #ifdef GGML_CUDA_DMMV_F16
470
+ #ifdef GGML_CUDA_F16
440
471
  v = __hmul2(v, {d, d});
441
472
  v = __hadd2(v, {m, m});
442
473
  #else
443
474
  v.x = (v.x * d) + m;
444
475
  v.y = (v.y * d) + m;
445
- #endif // GGML_CUDA_DMMV_F16
476
+ #endif // GGML_CUDA_F16
446
477
  }
447
478
 
448
479
  static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -453,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
453
484
  v.x = x[ib].qs[iqs + 0];
454
485
  v.y = x[ib].qs[iqs + 1];
455
486
 
456
- #ifdef GGML_CUDA_DMMV_F16
487
+ #ifdef GGML_CUDA_F16
457
488
  v = __hmul2(v, {d, d});
458
489
  #else
459
490
  v.x *= d;
460
491
  v.y *= d;
461
- #endif // GGML_CUDA_DMMV_F16
492
+ #endif // GGML_CUDA_F16
462
493
  }
463
494
 
464
495
  //================================== k-quants
@@ -477,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
477
508
  const uint8_t q = x[i].qs[32*n + l];
478
509
  float * y = yy + i*QK_K + 128*n;
479
510
 
480
- float dall = x[i].d;
481
- float dmin = x[i].dmin;
511
+ float dall = x[i].dm.x;
512
+ float dmin = x[i].dm.y;
482
513
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
483
514
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
484
515
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -488,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
488
519
  const int il = tid%16; // 0...15
489
520
  const uint8_t q = x[i].qs[il] >> (2*is);
490
521
  float * y = yy + i*QK_K + 16*is + il;
491
- float dall = x[i].d;
492
- float dmin = x[i].dmin;
522
+ float dall = x[i].dm.x;
523
+ float dmin = x[i].dm.y;
493
524
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
494
525
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
495
526
  #endif
@@ -575,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
575
606
 
576
607
  float * y = yy + i*QK_K + 64*il + n*ir;
577
608
 
578
- const float dall = x[i].d;
579
- const float dmin = x[i].dmin;
609
+ const float dall = x[i].dm.x;
610
+ const float dmin = x[i].dm.y;
580
611
 
581
612
  const uint8_t * q = x[i].qs + 32*il + n*ir;
582
613
 
@@ -614,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
614
645
 
615
646
  float * y = yy + i*QK_K + 64*il + 2*ir;
616
647
 
617
- const float dall = x[i].d;
618
- const float dmin = x[i].dmin;
648
+ const float dall = x[i].dm.x;
649
+ const float dmin = x[i].dm.y;
619
650
 
620
651
  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
621
652
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -727,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
727
758
  const float * y = yy + i * QK_K + y_offset;
728
759
  const uint8_t * q = x[i].qs + q_offset;
729
760
 
730
- const float dall = x[i].d;
731
- const float dmin = x[i].dmin;
761
+ const float dall = x[i].dm.x;
762
+ const float dmin = x[i].dm.y;
732
763
 
733
764
  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
734
765
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -770,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
770
801
  uaux[0] = s[0] & 0x0f0f0f0f;
771
802
  uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
772
803
 
773
- const half2 * dh = (const half2 *)&x[i].d;
774
-
775
- const float2 dall = __half22float2(dh[0]);
804
+ const float2 dall = __half22float2(x[i].dm);
776
805
 
777
806
  float sum1 = 0, sum2 = 0;
778
807
  for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -935,17 +964,23 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
935
964
  uint16_t aux[4];
936
965
  const uint8_t * sc = (const uint8_t *)aux;
937
966
 
967
+ #if K_QUANTS_PER_ITERATION == 2
968
+ uint32_t q32[4];
969
+ const uint8_t * q4 = (const uint8_t *)q32;
970
+ #else
971
+ uint16_t q16[4];
972
+ const uint8_t * q4 = (const uint8_t *)q16;
973
+ #endif
974
+
938
975
  float tmp = 0; // partial sum for thread in warp
939
976
 
940
977
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
941
978
 
942
- const uint8_t * q1 = x[i].qs + q_offset;
943
- const uint8_t * q2 = q1 + 64;
944
979
  const float * y1 = yy + i*QK_K + y_offset;
945
980
  const float * y2 = y1 + 128;
946
981
 
947
- const float dall = x[i].d;
948
- const float dmin = x[i].dmin;
982
+ const float dall = x[i].dm.x;
983
+ const float dmin = x[i].dm.y;
949
984
 
950
985
  const uint16_t * a = (const uint16_t *)x[i].scales;
951
986
  aux[0] = a[im+0] & kmask1;
@@ -953,14 +988,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
953
988
  aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
954
989
  aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
955
990
 
991
+ #if K_QUANTS_PER_ITERATION == 2
992
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
993
+ const uint32_t * q2 = q1 + 16;
994
+
995
+ q32[0] = q1[0] & 0x0f0f0f0f;
996
+ q32[1] = q1[0] & 0xf0f0f0f0;
997
+ q32[2] = q2[0] & 0x0f0f0f0f;
998
+ q32[3] = q2[0] & 0xf0f0f0f0;
999
+
956
1000
  float4 s = {0.f, 0.f, 0.f, 0.f};
957
1001
  float smin = 0;
958
- for (int l = 0; l < n; ++l) {
959
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
960
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
1002
+ for (int l = 0; l < 4; ++l) {
1003
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
1004
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
1005
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
1006
+ }
1007
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
1008
+ #else
1009
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
1010
+ const uint16_t * q2 = q1 + 32;
1011
+
1012
+ q16[0] = q1[0] & 0x0f0f;
1013
+ q16[1] = q1[0] & 0xf0f0;
1014
+ q16[2] = q2[0] & 0x0f0f;
1015
+ q16[3] = q2[0] & 0xf0f0;
1016
+
1017
+ float4 s = {0.f, 0.f, 0.f, 0.f};
1018
+ float smin = 0;
1019
+ for (int l = 0; l < 2; ++l) {
1020
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
1021
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
961
1022
  smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
962
1023
  }
963
- tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
1024
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
1025
+ #endif
964
1026
 
965
1027
  }
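
In the K_QUANTS_PER_ITERATION == 2 path above, the q4_K kernel no longer masks each byte separately: it loads four packed bytes as one 32-bit word and splits them into low nibbles (& 0x0f0f0f0f) and high nibbles (& 0xf0f0f0f0) with two AND operations. The high-nibble bytes are left in place instead of being shifted down, so each one is 16 times the intended quant; the factor 1.f/16.f applied to sc[1] and sc[5] compensates for that. A worked example:

    // One packed word, bytes 0x4A 0x3B 0x2C 0x1D (little-endian value 0x1D2C3B4A):
    //   word & 0x0f0f0f0f = 0x0D0C0B0A  -> bytes 10, 11, 12, 13  (the low nibbles)
    //   word & 0xf0f0f0f0 = 0x10203040  -> bytes 64, 48, 32, 16  (= 16 * high nibble)
    // so sum(y[l] * high_byte[l]) == 16 * sum(y[l] * (byte[l] >> 4)),
    // which is exactly what multiplying the matching scale by 1.f/16.f undoes.
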
966
1028
  #else
@@ -1040,16 +1102,18 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1040
1102
  uint16_t aux[4];
1041
1103
  const uint8_t * sc = (const uint8_t *)aux;
1042
1104
 
1105
+ uint16_t q16[8];
1106
+ const uint8_t * q4 = (const uint8_t *)q16;
1107
+
1043
1108
  for (int i = ix; i < num_blocks_per_row; i += 2) {
1044
1109
 
1045
1110
  const uint8_t * ql1 = x[i].qs + q_offset;
1046
- const uint8_t * ql2 = ql1 + 64;
1047
1111
  const uint8_t * qh = x[i].qh + l0;
1048
1112
  const float * y1 = yy + i*QK_K + y_offset;
1049
1113
  const float * y2 = y1 + 128;
1050
1114
 
1051
- const float dall = x[i].d;
1052
- const float dmin = x[i].dmin;
1115
+ const float dall = x[i].dm.x;
1116
+ const float dmin = x[i].dm.y;
1053
1117
 
1054
1118
  const uint16_t * a = (const uint16_t *)x[i].scales;
1055
1119
  aux[0] = a[im+0] & kmask1;
@@ -1059,15 +1123,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1059
1123
 
1060
1124
  float4 sum = {0.f, 0.f, 0.f, 0.f};
1061
1125
  float smin = 0;
1126
+ const uint16_t * q1 = (const uint16_t *)ql1;
1127
+ const uint16_t * q2 = q1 + 32;
1128
+ q16[0] = q1[0] & 0x0f0f;
1129
+ q16[1] = q1[8] & 0x0f0f;
1130
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
1131
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
1132
+ q16[4] = q2[0] & 0x0f0f;
1133
+ q16[5] = q2[8] & 0x0f0f;
1134
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
1135
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
1062
1136
  for (int l = 0; l < n; ++l) {
1063
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
1064
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
1065
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
1066
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
1067
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
1068
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
1069
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
1070
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
1137
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
1138
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
1139
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
1140
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
1141
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
1142
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
1143
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
1144
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
1071
1145
  smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
1072
1146
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
1073
1147
  }
@@ -1227,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1227
1301
  v.y = x[ib + iqs + 1];
1228
1302
  }
1229
1303
 
1230
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1231
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
1304
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1305
+ const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1232
1306
 
1233
- if (i >= k) {
1307
+ if (ix >= kx_padded) {
1234
1308
  return;
1235
1309
  }
1236
1310
 
1311
+ const int iy = blockDim.y*blockIdx.y + threadIdx.y;
1312
+
1313
+ const int i_padded = iy*kx_padded + ix;
1314
+
1237
1315
  block_q8_1 * y = (block_q8_1 *) vy;
1238
1316
 
1239
- const int ib = i / QK8_1; // block index
1240
- const int iqs = i % QK8_1; // quant index
1317
+ const int ib = i_padded / QK8_1; // block index
1318
+ const int iqs = i_padded % QK8_1; // quant index
1241
1319
 
1242
- const float xi = i < ndata ? x[i] : 0.0f;
1320
+ const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
1243
1321
  float amax = fabsf(xi);
1244
1322
  float sum = xi;
1245
1323
 
@@ -1258,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1258
1336
  return;
1259
1337
  }
1260
1338
 
1261
- y[ib].d = d;
1262
- y[ib].s = sum;
1339
+ y[ib].ds.x = d;
1340
+ y[ib].ds.y = sum;
1263
1341
  }
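
quantize_q8_1 now runs on a 2D grid: ix walks the padded row (kx_padded) and iy selects the row, so a whole activation matrix can be quantized in one launch, with columns past kx quantized as zeros. Each 32-value block keeps its scale d = amax/127 and the raw sum of its input values in the packed ds field; that sum is what the q4_1/q5_1 and q8_1 dot products below use for their min/offset terms. A scalar reference for one block (a sketch; the kernel above does this warp-parallel with warp reductions):

    // Scalar reference for quantizing one q8_1 block of QK8_1 values. Illustrative.
    static __device__ void quantize_block_q8_1_ref(const float * x, block_q8_1 * y) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < QK8_1; ++i) {
            amax = fmaxf(amax, fabsf(x[i]));
            sum += x[i];
        }
        const float d = amax / 127.0f;
        for (int i = 0; i < QK8_1; ++i) {
            y->qs[i] = amax == 0.0f ? 0 : (int8_t) roundf(x[i] / d);
        }
        y->ds = __floats2half2_rn(d, sum); // ds.x = delta, ds.y = sum
    }
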
1264
1342
 
1265
1343
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1283,363 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1283
1361
  y[iybs + iqs + y_offset] = v.y;
1284
1362
  }
1285
1363
 
1286
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1287
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1288
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1289
- const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1364
+ // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
1365
+ // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
1366
+
1367
+ #define VDR_Q4_0_Q8_1_MMVQ 2
1368
+ #define VDR_Q4_0_Q8_1_MMQ 4
1290
1369
 
1291
- int vi;
1292
- memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1293
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1294
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1370
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
1371
+ const int * v, const int * u, const float & d4, const half2 & ds8) {
1295
1372
 
1296
- const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1373
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1374
+ int sumi = 0;
1297
1375
 
1298
- // subtract 8 from each quantized value
1299
- const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1300
- const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1376
+ #pragma unroll
1377
+ for (int i = 0; i < vdr; ++i) {
1378
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1379
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1301
1380
 
1302
- // SIMD dot product of quantized values
1303
- int sumi = __dp4a(vi0, ui0, 0);
1304
- sumi = __dp4a(vi1, ui1, sumi);
1381
+ // SIMD dot product of quantized values
1382
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1383
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1384
+ }
1305
1385
 
1306
- return sumi*d;
1386
+ // second part effectively subtracts 8 from each quant value
1387
+ return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
1307
1388
  #else
1308
1389
  return 0.0f; // only to satisfy the compiler
1309
1390
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1310
1391
  }
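
The refactored q4_0 x q8_1 dot product no longer subtracts 8 from every nibble with __vsub4 before the __dp4a calls; the comment "second part effectively subtracts 8" is plain algebra. With d4 the q4_0 scale, ds8.x the q8_1 scale and ds8.y the q8_1 block's unquantized sum (approximately d8 times the sum of the y quants):

    // contribution of one call covering vdr packed ints (8*vdr value pairs):
    //   sum_i d4*(q_i - 8) * d8*u_i
    //     = d4 * ( d8 * sum_i q_i*u_i  -  8 * d8 * sum_i u_i )
    //     = d4 * ( ds8.x * sumi        -  8 * (this call's share of ds8.y) )
    // ds8.y covers the whole 32-value q8_1 block, and QI4_0/vdr such calls
    // accumulate into the same result, so each call subtracts the fraction
    // vdr/QI4_0 of it -- the (8*vdr/QI4_0) factor in the return statement.
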
1311
1392
 
1312
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1313
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1314
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1315
- const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1393
+ #define VDR_Q4_1_Q8_1_MMVQ 2
1394
+ #define VDR_Q4_1_Q8_1_MMQ 4
1316
1395
 
1317
- const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1318
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1319
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1396
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
1397
+ const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
1398
+
1399
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1400
+ int sumi = 0;
1320
1401
 
1321
- const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1322
- const float m = bq4_1->m;
1323
- const float s = bq8_1->s;
1402
+ #pragma unroll
1403
+ for (int i = 0; i < vdr; ++i) {
1404
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1405
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1324
1406
 
1325
- const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1326
- const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1407
+ // SIMD dot product of quantized values
1408
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1409
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1410
+ }
1327
1411
 
1328
- // SIMD dot product of quantized values
1329
- int sumi = __dp4a(vi0, ui0, 0);
1330
- sumi = __dp4a(vi1, ui1, sumi);
1412
+ #ifdef GGML_CUDA_F16
1413
+ const half2 tmp = __hmul2(dm4, ds8);
1414
+ const float d4d8 = __half2float(tmp.x);
1415
+ const float m4s8 = __half2float(tmp.y);
1416
+ #else
1417
+ const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
1418
+ const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
1419
+ #endif // GGML_CUDA_F16
1331
1420
 
1332
- return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1421
+ // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1422
+ return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1333
1423
  #else
1334
1424
  return 0.0f; // only to satisfy the compiler
1335
1425
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1336
1426
  }
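
The divisor in the comment "scale second part of sum by QI8_1/(vdr * QR4_1)" is the same bookkeeping with a min term instead of the implicit -8: a q4_1 block should contribute m4 * (sum of y) ~ m4s8 exactly once, but several calls accumulate into the same output, so each adds only its share. The identity, spelled out:

    // calls covering one q4_1 block = (packed ints per block) / (ints per call)
    //                               = QI4_1 / vdr
    // and QI4_1 = QK4_1 / (4*QR4_1) = QI8_1 / QR4_1, hence
    //   QI4_1 / vdr = QI8_1 / (vdr * QR4_1)   -- the divisor applied to m4s8.
    // With QK4_1 = 32: QI8_1 = 8, QR4_1 = 2, vdr = 2, so m4s8 is split over 2 calls.
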
1337
1427
 
1338
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1339
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1428
+ #define VDR_Q5_0_Q8_1_MMVQ 2
1429
+ #define VDR_Q5_0_Q8_1_MMQ 4
1430
+
1431
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
1432
+ const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
1433
+
1340
1434
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1341
- const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1435
+ int sumi = 0;
1436
+
1437
+ for (int i = 0; i < vdr; ++i) {
1438
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1439
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1440
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1441
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1442
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1443
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1444
+
1445
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1446
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1447
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1448
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1449
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1450
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1451
+ }
1342
1452
 
1343
- int qs;
1344
- memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1345
- const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1346
- const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1347
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1348
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1349
-
1350
- const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1351
-
1352
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1353
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1354
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1355
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1356
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1357
- vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1358
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1359
-
1360
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1361
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1362
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1363
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1364
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1365
- vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1366
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1367
-
1368
- return sumi*d;
1453
+ // second part effectively subtracts 16 from each quant value
1454
+ return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
1369
1455
  #else
1370
1456
  return 0.0f; // only to satisfy the compiler
1371
1457
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1372
1458
  }
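
vec_dot_q5_0_q8_1_impl rebuilds the 5-bit quants on the fly: the low nibbles come from vl, and the matching high bits from vh are shifted into bit positions 4, 12, 20 and 28 so that every byte of vi0/vi1 holds the full 5-bit value before the __dp4a calls. Tracing the masks:

    // low-nibble word vi0 takes qh bits 0..3 as the 5th bit of bytes 0..3:
    //   bit 0 -> bit  4 : (vh << 4)  & 0x00000010
    //   bit 1 -> bit 12 : (vh << 11) & 0x00001000
    //   bit 2 -> bit 20 : (vh << 18) & 0x00100000
    //   bit 3 -> bit 28 : (vh << 25) & 0x10000000
    // the high-nibble word vi1 takes qh bits 16..19, which is why it shifts
    // right by 12 and 5 first, then left by 2 and 9.
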
1373
1459
 
1374
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1375
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1460
+ #define VDR_Q5_1_Q8_1_MMVQ 2
1461
+ #define VDR_Q5_1_Q8_1_MMQ 4
1462
+
1463
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
1464
+ const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
1465
+
1376
1466
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1377
- const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1467
+ int sumi = 0;
1468
+
1469
+ for (int i = 0; i < vdr; ++i) {
1470
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1471
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1472
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1473
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1474
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1475
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1476
+
1477
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1478
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1479
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1480
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1481
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1482
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1483
+ }
1484
+
1485
+ #ifdef GGML_CUDA_F16
1486
+ const half2 tmp = __hmul2(dm5, ds8);
1487
+ const float d5d8 = __half2float(tmp.x);
1488
+ const float m5s8 = __half2float(tmp.y);
1489
+ #else
1490
+ const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
1491
+ const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
1492
+ #endif // GGML_CUDA_F16
1493
+
1494
+ // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
1495
+ return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1378
1496
 
1379
- const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1380
- const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1381
- const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1382
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1383
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1384
-
1385
- const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1386
- const float m = bq5_1->m;
1387
- const float s = bq8_1->s;
1388
-
1389
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1390
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1391
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1392
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1393
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1394
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1395
-
1396
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1397
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1398
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1399
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1400
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1401
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1402
-
1403
- return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1404
1497
  #else
1405
1498
  return 0.0f; // only to satisfy the compiler
1406
1499
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1407
1500
  }
1408
1501
 
1409
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1410
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1411
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1412
- const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1502
+ #define VDR_Q8_0_Q8_1_MMVQ 2
1503
+ #define VDR_Q8_0_Q8_1_MMQ 8
1413
1504
 
1414
- int vi;
1415
- memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1416
- const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1505
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1506
+ const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
1417
1507
 
1418
- const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1508
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1509
+ int sumi = 0;
1419
1510
 
1420
- // SIMD dot product of quantized values
1421
- int sumi = __dp4a(vi, ui, 0);
1511
+ for (int i = 0; i < vdr; ++i) {
1512
+ // SIMD dot product of quantized values
1513
+ sumi = __dp4a(v[i], u[i], sumi);
1514
+ }
1422
1515
 
1423
- return sumi*d;
1516
+ return sumi * d8_0 * __half2float(ds8_1.x);
1424
1517
  #else
1425
1518
  return 0.0f; // only to satisfy the compiler
1426
1519
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1427
1520
  }
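
vec_dot_q8_0_q8_1_impl (and the q8_1 x q8_1 variant that follows) reduces to a chain of __dp4a calls. __dp4a(a, b, c) treats each 32-bit operand as four packed 8-bit lanes, multiplies them pairwise and adds the four products plus c in one instruction; the MIN_CC_DP4A guard exists because the instruction requires compute capability 6.1 or newer. A scalar model, for reference (illustrative; with int operands the lanes are signed):

    // Scalar equivalent of __dp4a(int, int, int): four signed 8-bit lanes.
    static __device__ int dp4a_ref(const int a, const int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        #pragma unroll
        for (int i = 0; i < 4; ++i) {
            c += (int) a8[i] * (int) b8[i];
        }
        return c;
    }
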
1428
1521
 
1429
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1430
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1522
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
1523
+ const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
1431
1524
 
1432
1525
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1433
- const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1526
+ int sumi = 0;
1434
1527
 
1435
- const int bq8_offset = QR2_K * (iqs / QI8_1);
1436
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1528
+ for (int i = 0; i < vdr; ++i) {
1529
+ // SIMD dot product of quantized values
1530
+ sumi = __dp4a(v[i], u[i], sumi);
1531
+ }
1437
1532
 
1438
- float sumf_d = 0.0f;
1439
- float sumf_m = 0.0f;
1533
+ #ifdef GGML_CUDA_F16
1534
+ const half2 tmp = __hmul2(dm8, ds8);
1535
+ const float d8d8 = __half2float(tmp.x);
1536
+ const float m8s8 = __half2float(tmp.y);
1537
+ #else
1538
+ const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
+ const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1540
+ #endif // GGML_CUDA_F16
1440
1541
 
1441
- const float d = bq2_K->d;
1442
- const float dmin = bq2_K->dmin;
1542
+ // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1543
+ return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1544
+ #else
1545
+ return 0.0f; // only to satisfy the compiler
1546
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
+ }
1443
1548
 
1444
- const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1549
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1550
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1445
1551
 
1446
- for (int i = 0; i < QR2_K; ++i) {
1447
- const int sc = bq2_K->scales[scale_offset + 2*i];
1552
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1553
+
1554
+ int v[VDR_Q4_0_Q8_1_MMVQ];
1555
+ int u[2*VDR_Q4_0_Q8_1_MMVQ];
1556
+
1557
+ #pragma unroll
1558
+ for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
1559
+ v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
1560
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1561
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
1562
+ }
1563
+
1564
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
+ }
1566
+
1567
+ static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
+
1569
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1571
+
1572
+ *x_ql = tile_x_qs;
1573
+ *x_dm = (half2 *) tile_x_d;
1574
+ }
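
allocate_tiles_q4_0 reserves the shared memory for one x tile: GGML_CUDA_MMQ_Y rows of WARP_SIZE packed ints plus GGML_CUDA_MMQ_Y spare ints, i.e. an effective row stride of WARP_SIZE + 1. That is the stride load_tiles_q4_0 (just below) and the mul_mat dot functions index with, i * (WARP_SIZE + 1) + k; padding each row by one int is a common trick to keep same-column accesses from different rows out of the same shared-memory bank (stated here as the likely intent, not taken from the package). The scale array gets an analogous GGML_CUDA_MMQ_Y/QI4_0 worth of extra entries. The index math, spelled out:

    // Padded-tile indexing used throughout the new MMQ code; illustrative helper.
    static __device__ __forceinline__ int tile_index(const int row, const int k) {
        return row * (WARP_SIZE + 1) + k; // one spare int per row staggers the banks
    }
    // total ints reserved: GGML_CUDA_MMQ_Y * WARP_SIZE + GGML_CUDA_MMQ_Y
    //                    = GGML_CUDA_MMQ_Y * (WARP_SIZE + 1)
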
1575
+
1576
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
+
1580
+ __builtin_assume(i_offset >= 0);
1581
+ __builtin_assume(i_offset < 8);
1582
+ __builtin_assume(k >= 0);
1583
+ __builtin_assume(k < WARP_SIZE);
1584
+
1585
+ const int kbx = k / QI4_0;
1586
+ const int kqsx = k % QI4_0;
1587
+
1588
+ const block_q4_0 * bx0 = (block_q4_0 *) vx;
1589
+
1590
+ float * x_dmf = (float *) x_dm;
1591
+
1592
+ #pragma unroll
1593
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1594
+ int i = i0 + i_offset;
1595
+
1596
+ if (need_check) {
1597
+ i = min(i, i_max);
1598
+ }
1599
+
1600
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
+
1602
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1604
+ }
1605
+
1606
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
+ // const int kbxd = k % blocks_per_tile_x_row;
1608
+
1609
+ // #pragma unroll
1610
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
+ // FIXME out-of-bounds
1612
+ // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
+
1614
+ // if (i >= GGML_CUDA_MMQ_Y) {
1615
+ // return;
1616
+ // }
1617
+
1618
+ // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
+
1620
+ // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
+ // }
1622
+ }
1623
+
1624
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1627
+
1628
+ __builtin_assume(i >= 0);
1629
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
+ __builtin_assume(j >= 0);
1631
+ __builtin_assume(j < WARP_SIZE);
1632
+ __builtin_assume(k >= 0);
1633
+ __builtin_assume(k < WARP_SIZE);
1634
+
1635
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
+ const float * x_dmf = (float *) x_dm;
1637
+
1638
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
1639
+
1640
+ #pragma unroll
1641
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
1644
+ }
1645
+
1646
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1649
+ }
1650
+
1651
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1652
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1653
+
1654
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1655
+
1656
+ int v[VDR_Q4_1_Q8_1_MMVQ];
1657
+ int u[2*VDR_Q4_1_Q8_1_MMVQ];
1658
+
1659
+ #pragma unroll
1660
+ for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
1661
+ v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
1662
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1663
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
1664
+ }
1665
+
1666
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
+ }
1668
+
1669
+ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1670
+
1671
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
1672
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
1673
+
1674
+ *x_ql = tile_x_qs;
1675
+ *x_dm = tile_x_dm;
1676
+ }
1677
+
1678
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
+
1682
+ __builtin_assume(i_offset >= 0);
1683
+ __builtin_assume(i_offset < 8);
1684
+ __builtin_assume(k >= 0);
1685
+ __builtin_assume(k < WARP_SIZE);
1686
+
1687
+ const int kbx = k / QI4_1;
1688
+ const int kqsx = k % QI4_1;
1689
+
1690
+ const block_q4_1 * bx0 = (block_q4_1 *) vx;
1691
+
1692
+ #pragma unroll
1693
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1694
+ int i = i0 + i_offset;
1695
+
1696
+ if (need_check) {
1697
+ i = min(i, i_max);
1698
+ }
1699
+
1700
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
1701
+
1702
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
1703
+ }
1704
+
1705
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
1706
+ const int kbxd = k % blocks_per_tile_x_row;
1707
+
1708
+ #pragma unroll
1709
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
1710
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
+
1712
+ if (need_check) {
1713
+ i = min(i, i_max);
1714
+ }
1715
+
1716
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1717
+
1718
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
1719
+ }
1720
+ }
1721
+
1722
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1725
+
1726
+ __builtin_assume(i >= 0);
1727
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
+ __builtin_assume(j >= 0);
1729
+ __builtin_assume(j < WARP_SIZE);
1730
+ __builtin_assume(k >= 0);
1731
+ __builtin_assume(k < WARP_SIZE);
1732
+
1733
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1734
+
1735
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
1736
+
1737
+ #pragma unroll
1738
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
1741
+ }
1742
+
1743
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1746
+ }
1747
+
1748
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1749
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1750
+
1751
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1752
+
1753
+ int vl[VDR_Q5_0_Q8_1_MMVQ];
1754
+ int vh[VDR_Q5_0_Q8_1_MMVQ];
1755
+ int u[2*VDR_Q5_0_Q8_1_MMVQ];
1756
+
1757
+ #pragma unroll
1758
+ for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
1759
+ vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
1760
+ vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
1761
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1762
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
1763
+ }
1764
+
1765
+ return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
+ }
1767
+
1768
+ static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
+
1770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
1772
+
1773
+ *x_ql = tile_x_ql;
1774
+ *x_dm = (half2 *) tile_x_d;
1775
+ }
1776
+
1777
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
+
1781
+ __builtin_assume(i_offset >= 0);
1782
+ __builtin_assume(i_offset < 8);
1783
+ __builtin_assume(k >= 0);
1784
+ __builtin_assume(k < WARP_SIZE);
1785
+
1786
+ const int kbx = k / QI5_0;
1787
+ const int kqsx = k % QI5_0;
1788
+
1789
+ const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
+
1791
+ #pragma unroll
1792
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1793
+ int i = i0 + i_offset;
1794
+
1795
+ if (need_check) {
1796
+ i = min(i, i_max);
1797
+ }
1798
+
1799
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
1800
+
1801
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
1802
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
1803
+
1804
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1805
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1806
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1807
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1808
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1809
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
1810
+
1811
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1812
+
1813
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1814
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1815
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1816
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1817
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1818
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
1819
+
1820
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1821
+ }
1822
+
1823
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
1824
+ const int kbxd = k % blocks_per_tile_x_row;
1825
+ float * x_dmf = (float *) x_dm;
1826
+
1827
+ #pragma unroll
1828
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
1829
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
+
1831
+ if (need_check) {
1832
+ i = min(i, i_max);
1833
+ }
1834
+
1835
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1836
+
1837
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
1838
+ }
1839
+ }
1840
+
1841
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
+
1845
+ __builtin_assume(i >= 0);
1846
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
+ __builtin_assume(j >= 0);
1848
+ __builtin_assume(j < WARP_SIZE);
1849
+ __builtin_assume(k >= 0);
1850
+ __builtin_assume(k < WARP_SIZE);
1851
+
1852
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
+ const float * x_dmf = (float *) x_dm;
1855
+
1856
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
+
1858
+ #pragma unroll
1859
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
1862
+ }
1863
+
1864
+ return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1866
+ }
1867
+
1868
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1869
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1870
+
1871
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1872
+
1873
+ int vl[VDR_Q5_1_Q8_1_MMVQ];
1874
+ int vh[VDR_Q5_1_Q8_1_MMVQ];
1875
+ int u[2*VDR_Q5_1_Q8_1_MMVQ];
1876
+
1877
+ #pragma unroll
1878
+ for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
1879
+ vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
1880
+ vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
1881
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1882
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
1883
+ }
1884
+
1885
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
+ }
1887
+
1888
+ static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
+
1890
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
1892
+
1893
+ *x_ql = tile_x_ql;
1894
+ *x_dm = tile_x_dm;
1895
+ }
1896
+
1897
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
+
1901
+ __builtin_assume(i_offset >= 0);
1902
+ __builtin_assume(i_offset < 8);
1903
+ __builtin_assume(k >= 0);
1904
+ __builtin_assume(k < WARP_SIZE);
1905
+
1906
+ const int kbx = k / QI5_1;
1907
+ const int kqsx = k % QI5_1;
1908
+
1909
+ const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
+
1911
+ #pragma unroll
1912
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1913
+ int i = i0 + i_offset;
1914
+
1915
+ if (need_check) {
1916
+ i = min(i, i_max);
1917
+ }
1918
+
1919
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
1920
+
1921
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
1922
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
1923
+
1924
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1925
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1926
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1927
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1928
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1929
+
1930
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1931
+
1932
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1933
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1934
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1935
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1936
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1937
+
1938
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1939
+ }
1940
+
1941
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
1942
+ const int kbxd = k % blocks_per_tile_x_row;
1943
+
1944
+ #pragma unroll
1945
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
1946
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
+
1948
+ if (need_check) {
1949
+ i = min(i, i_max);
1950
+ }
1951
+
1952
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1953
+
1954
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
1955
+ }
1956
+ }
1957
+
1958
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
+
1962
+ __builtin_assume(i >= 0);
1963
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
+ __builtin_assume(j >= 0);
1965
+ __builtin_assume(j < WARP_SIZE);
1966
+ __builtin_assume(k >= 0);
1967
+ __builtin_assume(k < WARP_SIZE);
1968
+
1969
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
+ const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
1971
+
1972
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
1973
+
1974
+ #pragma unroll
1975
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
1978
+ }
1979
+
1980
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1982
+ }
1983
+
1984
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1985
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1986
+
1987
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1988
+
1989
+ int v[VDR_Q8_0_Q8_1_MMVQ];
1990
+ int u[VDR_Q8_0_Q8_1_MMVQ];
1991
+
1992
+ for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
+ v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
+ u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
+ }
1996
+
1997
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
1998
+ }
1999
+
2000
+ static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
+
2002
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2004
+
2005
+ *x_ql = tile_x_qs;
2006
+ *x_dm = (half2 *) tile_x_d;
2007
+ }
2008
+
2009
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
+
2013
+ __builtin_assume(i_offset >= 0);
2014
+ __builtin_assume(i_offset < 8);
2015
+ __builtin_assume(k >= 0);
2016
+ __builtin_assume(k < WARP_SIZE);
2017
+
2018
+ const int kbx = k / QI8_0;
2019
+ const int kqsx = k % QI8_0;
2020
+ float * x_dmf = (float *) x_dm;
2021
+
2022
+ const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
+
2024
+ #pragma unroll
2025
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2026
+ int i = i0 + i_offset;
2027
+
2028
+ if (need_check) {
2029
+ i = min(i, i_max);
2030
+ }
2031
+
2032
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
+
2034
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
+ }
2037
+
2038
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
+ // const int kbxd = k % blocks_per_tile_x_row;
2040
+
2041
+ // #pragma unroll
2042
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
+ // FIXME out-of-bounds
2044
+ // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
+
2046
+ // #if GGML_CUDA_MMQ_Y < 64
2047
+ // if (i >= GGML_CUDA_MMQ_Y) {
2048
+ // return;
2049
+ // }
2050
+ // #endif // GGML_CUDA_MMQ_Y < 64
2051
+
2052
+ // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
+
2054
+ // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
+ // }
2056
+ }
2057
+
2058
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
+
2062
+ __builtin_assume(i >= 0);
2063
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
+ __builtin_assume(j >= 0);
2065
+ __builtin_assume(j < WARP_SIZE);
2066
+ __builtin_assume(k >= 0);
2067
+ __builtin_assume(k < WARP_SIZE);
2068
+
2069
+ const float * x_dmf = (float *) x_dm;
2070
+
2071
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
+ y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
+ }
2075
+
2076
+ #define VDR_q2_K_q8_1 1
2077
+
2078
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
+ const half2 & dm, const float * __restrict__ d8) {
2081
+
2082
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
+ float sumf_d = 0.0f;
2084
+ float sumf_m = 0.0f;
2085
+
2086
+ for (int i = 0; i < QR2_K; ++i) {
2087
+ const int sc = scales[2*i];
2088
+
2089
+ const int vi = (v >> (2*i)) & 0x03030303;
2090
+
2091
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
+
2093
+ int sc_high = sc >> 4;
2094
+ sc_high |= sc_high << 8;
2095
+ sc_high |= sc_high << 16;
2096
+ sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
+ }
2098
+
2099
+ const float2 dmf = __half22float2(dm);
2100
+
2101
+ return dmf.x*sumf_d - dmf.y*sumf_m;
2102
+ #else
2103
+ return 0.0f; // only to satisfy the compiler
2104
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2105
+ }
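Note: the sc_high shuffling above replicates the high nibble of the packed q2_K scale byte into all four byte lanes, so a single __dp4a against u[i] yields (sc >> 4) * sum(u_k) in one instruction. A host-side C++ check of that identity, with dp4a_ref as a scalar stand-in for the intrinsic and arbitrary example inputs:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // scalar stand-in for __dp4a: per-byte signed products accumulated into c
    static int dp4a_ref(int a, int b, int c) {
        int8_t va[4], vb[4];
        std::memcpy(va, &a, 4);
        std::memcpy(vb, &b, 4);
        for (int k = 0; k < 4; ++k) {
            c += va[k] * vb[k];
        }
        return c;
    }

    int main() {
        const uint8_t sc   = 0xA7;             // packed scale byte: low nibble = scale, high nibble = min
        const int8_t  u[4] = {12, -3, 7, 90};  // four q8_1 quants, packed into one int on the GPU

        int ui;
        std::memcpy(&ui, u, 4);

        int sc_high = sc >> 4;                 // same broadcast as in the kernel
        sc_high |= sc_high << 8;
        sc_high |= sc_high << 16;

        const int fused    = dp4a_ref(sc_high, ui, 0);
        const int expected = (sc >> 4) * (u[0] + u[1] + u[2] + u[3]);
        printf("fused=%d expected=%d\n", fused, expected);  // prints the same number twice
        return 0;
    }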
2106
+
2107
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2108
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2109
+
2110
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
2111
+
2112
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
2113
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2114
+
2115
+ const uint8_t * scales = bq2_K->scales + scale_offset;
2116
+
2117
+ const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2118
+ int u[QR2_K];
2119
+ float d8[QR2_K];
2120
+
2121
+ for (int i = 0; i < QR2_K; ++ i) {
2122
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2123
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2124
+ }
2125
+
2126
+ return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
2127
+ }
2128
+
2129
+ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2130
+
2131
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2132
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
2133
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2134
+
2135
+ *x_ql = tile_x_ql;
2136
+ *x_dm = tile_x_dm;
2137
+ *x_sc = tile_x_sc;
2138
+ }
2139
+
2140
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2141
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2142
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2143
+
2144
+ __builtin_assume(i_offset >= 0);
2145
+ __builtin_assume(i_offset < 8);
2146
+ __builtin_assume(k >= 0);
2147
+ __builtin_assume(k < WARP_SIZE);
2148
+
2149
+ const int kbx = k / QI2_K;
2150
+ const int kqsx = k % QI2_K;
2151
+
2152
+ const block_q2_K * bx0 = (block_q2_K *) vx;
2153
+
2154
+ #pragma unroll
2155
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2156
+ int i = i0 + i_offset;
2157
+
2158
+ if (need_check) {
2159
+ i = min(i, i_max);
2160
+ }
2161
+
2162
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
2163
+
2164
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2165
+ }
2166
+
2167
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
2168
+ const int kbxd = k % blocks_per_tile_x_row;
2169
+
2170
+ #pragma unroll
2171
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
2172
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2173
+
2174
+ if (need_check) {
2175
+ i = min(i, i_max);
2176
+ }
2177
+
2178
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
2179
+
2180
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
2181
+ }
2182
+
2183
+ #pragma unroll
2184
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2185
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2186
+
2187
+ if (need_check) {
2188
+ i = min(i, i_max);
2189
+ }
2190
+
2191
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
2192
+
2193
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
2194
+ }
2195
+ }
2196
+
2197
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2198
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2199
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2200
+
2201
+ __builtin_assume(i >= 0);
2202
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2203
+ __builtin_assume(j >= 0);
2204
+ __builtin_assume(j < WARP_SIZE);
2205
+ __builtin_assume(k >= 0);
2206
+ __builtin_assume(k < WARP_SIZE);
2207
+
2208
+ const int kbx = k / QI2_K;
2209
+ const int kqsx = k % QI2_K;
2210
+
2211
+ const int bq8_offset = QR2_K * (kqsx / QI8_1);
2212
+ const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2213
+
2214
+ const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
2215
+
2216
+ int u[QR2_K];
2217
+ float d8[QR2_K];
2218
+
2219
+ for (int l = 0; l < QR2_K; ++ l) {
2220
+ const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2221
+ u[l] = y_qs[y_qs_index];
2222
+ d8[l] = y_ds[y_qs_index / QI8_1].x;
2223
+ }
2224
+
2225
+ return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
2226
+ }
2227
+
2228
+ #define VDR_q3_K_q8_1 1
2229
+
2230
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
2231
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2232
+ const int & scale_offset, const float & d, const float * __restrict__ d8) {
2233
+
2234
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2235
+ float sumf = 0.0f;
2236
+
2237
+ for (int i = 0; i < QR3_K; ++i) {
2238
+ const int isc = scale_offset + 2*i;
2239
+
2240
+ const int isc_low = isc % (QK_K/32);
2241
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
2242
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
2243
+
2244
+ const int isc_high = isc % (QK_K/64);
2245
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
2246
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2247
+
2248
+ const int sc = (sc_low | sc_high) - 32;
2249
+
2250
+ const int vil = (vl >> (2*i)) & 0x03030303;
2251
+
2252
+ const int vih = ((vh >> i) << 2) & 0x04040404;
2253
+
2254
+ const int vi = __vsubss4(vil, vih);
2255
+
2256
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2257
+ }
2258
+
2259
+ return d*sumf;
2260
+ #else
2261
+ return 0.0f; // only to satisfy the compiler
2262
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2263
+ }
2264
+
2265
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2266
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2267
+
2268
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
2269
+
2270
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
2271
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2272
+
2273
+ const float d = bq3_K->d;
2274
+
2275
+ const int vl = get_int_from_uint8(bq3_K->qs, iqs);
2276
+
2277
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
+ const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
+
2280
+ int u[QR3_K];
2281
+ float d8[QR3_K];
2282
+
2283
+ for (int i = 0; i < QR3_K; ++i) {
2284
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
+ }
2287
+
2288
+ return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
+ }
2290
+
2291
+ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
+
2293
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2297
+
2298
+ *x_ql = tile_x_ql;
2299
+ *x_dm = tile_x_dm;
2300
+ *x_qh = tile_x_qh;
2301
+ *x_sc = tile_x_sc;
2302
+ }
2303
+
2304
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
+
2308
+ __builtin_assume(i_offset >= 0);
2309
+ __builtin_assume(i_offset < 8);
2310
+ __builtin_assume(k >= 0);
2311
+ __builtin_assume(k < WARP_SIZE);
2312
+
2313
+ const int kbx = k / QI3_K;
2314
+ const int kqsx = k % QI3_K;
2315
+
2316
+ const block_q3_K * bx0 = (block_q3_K *) vx;
2317
+
2318
+ #pragma unroll
2319
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2320
+ int i = i0 + i_offset;
2321
+
2322
+ if (need_check) {
2323
+ i = min(i, i_max);
2324
+ }
2325
+
2326
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
2327
+
2328
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
2329
+ }
2330
+
2331
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
+ const int kbxd = k % blocks_per_tile_x_row;
2333
+
2334
+ #pragma unroll
2335
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2337
+
2338
+ if (need_check) {
2339
+ i = min(i, i_max);
2340
+ }
2341
+
2342
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
+
2344
+ x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2345
+ }
2346
+
2347
+ #pragma unroll
2348
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2349
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
+
2351
+ if (need_check) {
2352
+ i = min(i, i_max);
2353
+ }
2354
+
2355
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
+
2357
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
+ }
2359
+
2360
+ #pragma unroll
2361
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2362
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
+
2364
+ if (need_check) {
2365
+ i = min(i, i_max);
2366
+ }
2367
+
2368
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
+
2370
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2371
+ }
2372
+ }
2373
+
2374
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
+
2378
+ __builtin_assume(i >= 0);
2379
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
+ __builtin_assume(j >= 0);
2381
+ __builtin_assume(j < WARP_SIZE);
2382
+ __builtin_assume(k >= 0);
2383
+ __builtin_assume(k < WARP_SIZE);
2384
+
2385
+ const int kbx = k / QI3_K;
2386
+ const int kqsx = k % QI3_K;
2387
+
2388
+ const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
+ const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2390
+
2391
+ const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2392
+
2393
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
+ const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2395
+
2396
+ int u[QR3_K];
2397
+ float d8[QR3_K];
2398
+
2399
+ for (int l = 0; l < QR3_K; ++ l) {
2400
+ const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
+ u[l] = y_qs[y_qs_index];
2402
+ d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
+ }
2404
+
2405
+ return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
+ x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
+ }
2408
+
2409
+ #define VDR_q4_K_q8_1 2
2410
+
2411
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
+
2415
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
+ float sumf_d = 0.0f;
2417
+ float sumf_m = 0.0f;
2418
+
2419
+ for (int i = 0; i < QR4_K; ++i) {
2420
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2422
+
2423
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2425
+
2426
+ sumf_d += d8[i] * (dot1 * sc[i]);
2427
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2428
+ }
2429
+
2430
+ return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
+
2432
+ #else
2433
+ return 0.0f; // only to satisfy the compiler
2434
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2435
+ }
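Note: the split into sumf_d and sumf_m above follows from how K-quant sub-blocks dequantize — a q4_K value is d*sc*q - dmin*m — so a dot product against q8_1 values separates into d*sc times the quant dot product minus dmin*m times the plain sum of the q8_1 quants (the second __dp4a against 0x01010101). A scalar host-side check of that algebra for one sub-block, with made-up example values:

    #include <cstdio>

    int main() {
        const float d = 0.5f, dmin = 0.25f;    // block scale and min (example values)
        const int   sc = 7, m = 3;             // 6-bit sub-block scale and min (example values)
        const int   nib[4] = { 1, 15,  8,  0}; // 4-bit quants
        const int   u[4]   = {12, -3,  7, 90}; // q8_1 quants
        const float d8     = 0.125f;           // q8_1 scale (example value)

        float direct = 0.0f;
        int   dot1 = 0, dot2 = 0;
        for (int k = 0; k < 4; ++k) {
            direct += (d*sc*nib[k] - dmin*m) * (d8*u[k]);  // dequantize, then multiply
            dot1   += nib[k]*u[k];                         // __dp4a(v, u, ...)
            dot2   += u[k];                                // __dp4a(0x01010101, u, ...)
        }
        const float fused = d*(d8*(dot1*sc)) - dmin*(d8*(dot2*m));  // what the kernel accumulates
        printf("direct=%f fused=%f\n", direct, fused);              // same value up to rounding
        return 0;
    }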
2436
+
2437
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2438
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2439
+
2440
+ #ifndef GGML_QKK_64
2441
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2442
+
2443
+ int v[2];
2444
+ int u[2*QR4_K];
2445
+ float d8[QR4_K];
2446
+
2447
+ // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
2448
+ const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
2449
+
2450
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
2451
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
2452
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
2453
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
2454
+
2455
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2456
+ v[0] = q4[0];
2457
+ v[1] = q4[4];
2458
+
2459
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
2460
+ uint16_t aux[2];
2461
+ const int j = bq8_offset/2;
2462
+ if (j < 2) {
2463
+ aux[0] = scales[j+0] & 0x3f3f;
2464
+ aux[1] = scales[j+2] & 0x3f3f;
2465
+ } else {
2466
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2467
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2468
+ }
2469
+ const uint8_t * sc = (const uint8_t *)aux;
2470
+ const uint8_t * m = sc + 2;
2471
+
2472
+ for (int i = 0; i < QR4_K; ++i) {
2473
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2474
+ d8[i] = bq8i->ds.x;
2475
+
2476
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2477
+ u[2*i+0] = q8[0];
2478
+ u[2*i+1] = q8[4];
2479
+ }
2480
+
2481
+ return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2482
+
2483
+ #else
2484
+
2485
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2486
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2487
+
2488
+ float sumf_d = 0.0f;
2489
+ float sumf_m = 0.0f;
2490
+
2491
+ uint16_t aux16[2];
2492
+ const uint8_t * s = (const uint8_t *)aux16;
2493
+
2494
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
2495
+ aux16[0] = a[0] & 0x0f0f;
2496
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2497
+
2498
+ const float dall = bq4_K->d[0];
2499
+ const float dmin = bq4_K->d[1];
2500
+
2501
+ const float d8_1 = bq8_1[0].ds.x;
2502
+ const float d8_2 = bq8_1[1].ds.x;
2503
+
2504
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2505
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2506
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2507
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2508
+
2509
+ const int * q4 = (const int *)bq4_K->qs + (iqs/2);
2510
+ const int v1 = q4[0];
2511
+ const int v2 = q4[4];
2512
+
2513
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
2514
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
2515
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2516
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2517
+
2518
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
2519
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2520
+
2521
+ return dall * sumf_d - dmin * sumf_m;
2522
+
2523
+ #else
2524
+ return 0.0f; // only to satisfy the compiler
2525
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2526
+
2527
+ #endif
2528
+ }
2529
+
2530
+ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
+
2532
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2535
+
2536
+ *x_ql = tile_x_ql;
2537
+ *x_dm = tile_x_dm;
2538
+ *x_sc = tile_x_sc;
2539
+ }
2540
+
2541
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
+
2545
+ __builtin_assume(i_offset >= 0);
2546
+ __builtin_assume(i_offset < 8);
2547
+ __builtin_assume(k >= 0);
2548
+ __builtin_assume(k < WARP_SIZE);
2549
+
2550
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
2551
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
2552
+
2553
+ const block_q4_K * bx0 = (block_q4_K *) vx;
2554
+
2555
+ #pragma unroll
2556
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2557
+ int i = i0 + i_offset;
2558
+
2559
+ if (need_check) {
2560
+ i = min(i, i_max);
2561
+ }
2562
+
2563
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2564
+
2565
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2566
+ }
2567
+
2568
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
+
2571
+ #pragma unroll
2572
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2574
+
2575
+ if (need_check) {
2576
+ i = min(i, i_max);
2577
+ }
2578
+
2579
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2580
+
2581
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2582
+ }
2583
+
2584
+ #pragma unroll
2585
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2587
+
2588
+ if (need_check) {
2589
+ i = min(i, i_max);
2590
+ }
2591
+
2592
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
+
2594
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
+ }
2596
+ }
2597
+
2598
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
+
2602
+ __builtin_assume(i >= 0);
2603
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
+ __builtin_assume(j >= 0);
2605
+ __builtin_assume(j < WARP_SIZE);
2606
+ __builtin_assume(k >= 0);
2607
+ __builtin_assume(k < WARP_SIZE);
2608
+
2609
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
+
2612
+ int v[2];
2613
+ int u[2*QR4_K];
2614
+ float d8[QR4_K];
2615
+
2616
+ // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
+ const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2618
+
2619
+ v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
+ v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2621
+
2622
+ const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
+ uint16_t aux[2];
2624
+ const int l = bq8_offset/2;
2625
+ if (l < 2) {
2626
+ aux[0] = scales[l+0] & 0x3f3f;
2627
+ aux[1] = scales[l+2] & 0x3f3f;
2628
+ } else {
2629
+ aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
+ aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2631
+ }
2632
+ const uint8_t * sc = (const uint8_t *)aux;
2633
+ const uint8_t * m = sc + 2;
2634
+
2635
+ for (int l = 0; l < QR4_K; ++l) {
2636
+ const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
+ u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
+ u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
+ d8[l] = y_ds[kqsy / QI8_1].x;
2640
+ }
2641
+
2642
+ return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
+ }
2644
+
2645
+ #define VDR_q5_K_q8_1 2
2646
+
2647
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2650
+
2651
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
+ float sumf_d = 0.0f;
2653
+ float sumf_m = 0.0f;
2654
+
2655
+ for (int i = 0; i < QR5_K; ++i) {
2656
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2658
+
2659
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2661
+
2662
+ const int v0i = vl0i | vh0i;
2663
+ const int v1i = vl1i | vh1i;
2664
+
2665
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2667
+
2668
+ sumf_d += d8[i] * (dot1 * sc[i]);
2669
+ sumf_m += d8[i] * (dot2 * m[i]);
2670
+
2671
+ }
2672
+
2673
+ return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2674
+
2675
+ #else
2676
+ return 0.0f; // only to satisfy the compiler
2677
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2678
+ }
2679
+
2680
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2681
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2682
+
2683
+ #ifndef GGML_QKK_64
2684
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2685
+
2686
+ int vl[2];
2687
+ int vh[2];
2688
+ int u[2*QR5_K];
2689
+ float d8[QR5_K];
2690
+
2691
+ const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
2692
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2693
+ const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
2694
+
2695
+ vl[0] = ql[0];
2696
+ vl[1] = ql[4];
2697
+
2698
+ vh[0] = qh[0] >> bq8_offset;
2699
+ vh[1] = qh[4] >> bq8_offset;
2700
+
2701
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
2702
+ uint16_t aux[2];
2703
+ const int j = bq8_offset/2;
2704
+ if (j < 2) {
2705
+ aux[0] = scales[j+0] & 0x3f3f;
2706
+ aux[1] = scales[j+2] & 0x3f3f;
2707
+ } else {
2708
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2709
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2710
+ }
2711
+ const uint8_t * sc = (const uint8_t *)aux;
2712
+ const uint8_t * m = sc + 2;
2713
+
2714
+ for (int i = 0; i < QR5_K; ++i) {
2715
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
+ d8[i] = bq8i->ds.x;
2717
+
2718
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2719
+ u[2*i+0] = q8[0];
2720
+ u[2*i+1] = q8[4];
2721
+ }
2722
+
2723
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2724
+
2725
+ #else
2726
+
2727
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2728
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2729
+
2730
+ const int8_t * s = bq5_K->scales;
2731
+
2732
+ const float d = bq5_K->d;
2733
+
2734
+ const float d8_1 = bq8_1[0].ds.x;
2735
+ const float d8_2 = bq8_1[1].ds.x;
2736
+
2737
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2738
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2739
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2740
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2741
+
2742
+ const int * ql = (const int *)bq5_K->qs + (iqs/2);
2743
+ const int vl1 = ql[0];
2744
+ const int vl2 = ql[4];
2745
+
2746
+ const int step = 4 * (iqs/2); // 0, 4, 8, 12
2747
+ const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
2748
+ const int in = step%8; // 0, 4, 0, 4
2749
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
2750
+
2751
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
2752
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
2753
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
2754
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
2755
+
2756
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
2757
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
2758
+
2759
+ return d * sumf_d;
2760
+
2761
+ #else
2762
+ return 0.0f; // only to satisfy the compiler
2763
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2764
+
2765
+ #endif
2766
+ }
2767
+
2768
+ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
+
2770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2774
+
2775
+ *x_ql = tile_x_ql;
2776
+ *x_dm = tile_x_dm;
2777
+ *x_qh = tile_x_qh;
2778
+ *x_sc = tile_x_sc;
2779
+ }
2780
+
2781
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
+
2785
+ __builtin_assume(i_offset >= 0);
2786
+ __builtin_assume(i_offset < 8);
2787
+ __builtin_assume(k >= 0);
2788
+ __builtin_assume(k < WARP_SIZE);
2789
+
2790
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
2791
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
2792
+
2793
+ const block_q5_K * bx0 = (block_q5_K *) vx;
2794
+
2795
+ #pragma unroll
2796
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2797
+ int i = i0 + i_offset;
2798
+
2799
+ if (need_check) {
2800
+ i = min(i, i_max);
2801
+ }
2802
+
2803
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2804
+
2805
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2806
+ }
2807
+
2808
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
+
2811
+ #pragma unroll
2812
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2814
+
2815
+ if (need_check) {
2816
+ i = min(i, i_max);
2817
+ }
2818
+
2819
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
2820
+
2821
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
2822
+ }
2823
+
2824
+ #pragma unroll
2825
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2827
+
2828
+ if (need_check) {
2829
+ i = min(i, i_max);
2830
+ }
2831
+
2832
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
+
2834
+ x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
+ }
2836
+
2837
+ #pragma unroll
2838
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2840
+
2841
+ if (need_check) {
2842
+ i = min(i, i_max);
2843
+ }
2844
+
2845
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2846
+
2847
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2848
+ }
2849
+ }
2850
+
2851
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
+
2855
+ __builtin_assume(i >= 0);
2856
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
+ __builtin_assume(j >= 0);
2858
+ __builtin_assume(j < WARP_SIZE);
2859
+ __builtin_assume(k >= 0);
2860
+ __builtin_assume(k < WARP_SIZE);
2861
+
2862
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
+
2865
+ int vl[2];
2866
+ int vh[2];
2867
+ int u[2*QR4_K];
2868
+ float d8[QR4_K];
2869
+
2870
+ const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
+
2872
+ vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
+ vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
+
2875
+ vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
+ vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
+
2878
+ const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
+ uint16_t aux[2];
2880
+ const int l = bq8_offset/2;
2881
+ if (l < 2) {
2882
+ aux[0] = scales[l+0] & 0x3f3f;
2883
+ aux[1] = scales[l+2] & 0x3f3f;
2884
+ } else {
2885
+ aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
+ aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
+ }
2888
+ const uint8_t * sc = (const uint8_t *)aux;
2889
+ const uint8_t * m = sc + 2;
2890
+
2891
+ for (int l = 0; l < QR5_K; ++l) {
2892
+ const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
+ u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
+ u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
+ d8[l] = y_ds[kqsy / QI8_1].x;
2896
+ }
2897
+
2898
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
+ }
2900
+
2901
+ #define VDR_q6_K_q8_1 1
2902
+
2903
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
+ const float & d, const float * __restrict__ d8) {
2906
+
2907
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
+ float sumf = 0.0f;
2909
+
2910
+ for (int i = 0; i < QR6_K; ++i) {
2911
+ const int sc = scales[4*i];
2912
+
2913
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
2914
+
2915
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
2916
+
2917
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
2918
+
2919
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2920
+ }
2921
+
2922
+ return d*sumf;
2923
+ #else
2924
+ return 0.0f; // only to satisfy the compiler
2925
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2926
+ }
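Note: for q6_K the quants are split across two arrays — the low 4 bits live in ql and the top 2 bits in qh — and the reassembled 6-bit value is recentred by subtracting 32, which the kernel does four bytes at a time with __vsubss4. A scalar host-side sketch of the same unpacking for a single quant, using example bit patterns:

    #include <cstdio>

    int main() {
        const unsigned ql_nibble = 0x0B;  // low 4 bits of the quant (example)
        const unsigned qh_bits   = 0x02;  // high 2 bits of the quant (example)

        // same reassembly as (vil | vih) followed by __vsubss4(..., 0x20202020):
        const int q = int(ql_nibble | (qh_bits << 4)) - 32;

        printf("q = %d\n", q);  // 0x0B | 0x20 = 43, minus 32 -> 11, i.e. a value in [-32, 31]
        return 0;
    }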
2927
+
2928
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2929
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2930
+
2931
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
2932
+
2933
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
2934
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
2935
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
2936
+
2937
+ const int vl = get_int_from_uint8(bq6_K->ql, iqs);
2938
+ const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
2939
+
2940
+ const int8_t * scales = bq6_K->scales + scale_offset;
2941
+
2942
+ int u[QR6_K];
2943
+ float d8[QR6_K];
2944
+
2945
+ for (int i = 0; i < QR6_K; ++i) {
2946
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
+ d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
2948
+ }
1448
2949
 
1449
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1450
- const float d8i = bq8i->d;
2950
+ return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
2951
+ }
1451
2952
 
1452
- const int vi = (v >> (2*i)) & 0x03030303;
1453
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
2953
+ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1454
2954
 
1455
- sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1456
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1457
- }
2955
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
1458
2959
 
1459
- return d*sumf_d - dmin*sumf_m;
1460
- #else
1461
- return 0.0f; // only to satisfy the compiler
1462
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2960
+ *x_ql = tile_x_ql;
2961
+ *x_dm = tile_x_dm;
2962
+ *x_qh = tile_x_qh;
2963
+ *x_sc = tile_x_sc;
1463
2964
  }
1464
2965
 
1465
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1466
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
2966
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1467
2969
 
1468
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1469
- const block_q3_K * bq3_K = (const block_q3_K *) vbq;
2970
+ __builtin_assume(i_offset >= 0);
2971
+ __builtin_assume(i_offset < 8);
2972
+ __builtin_assume(k >= 0);
2973
+ __builtin_assume(k < WARP_SIZE);
1470
2974
 
1471
- const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1472
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2975
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2976
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1473
2977
 
1474
- float sumf = 0.0f;
2978
+ const block_q6_K * bx0 = (block_q6_K *) vx;
1475
2979
 
1476
- const float d = bq3_K->d;
2980
+ #pragma unroll
2981
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2982
+ int i = i0 + i_offset;
1477
2983
 
1478
- int vl;
1479
- memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
2984
+ if (need_check) {
2985
+ i = min(i, i_max);
2986
+ }
1480
2987
 
1481
- int vh;
1482
- memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1483
- vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1484
- vh >>= bq8_offset;
2988
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1485
2989
 
1486
- for (int i = 0; i < QR3_K; ++i) {
1487
- const int isc = scale_offset + 2*i;
2990
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
+ }
1488
2992
 
1489
- const int isc_low = isc % (QK_K/32);
1490
- const int sc_shift_low = 4 * (isc / (QK_K/32));
1491
- const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
2993
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1492
2995
 
1493
- const int isc_high = isc % (QK_K/64);
1494
- const int sc_shift_high = 2 * (isc / (QK_K/64));
1495
- const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2996
+ #pragma unroll
2997
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
1496
2999
 
1497
- const int sc = (sc_low | sc_high) - 32;
3000
+ if (need_check) {
3001
+ i = min(i, i_max);
3002
+ }
1498
3003
 
1499
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1500
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1501
- const float d8i = bq8i->d;
3004
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1502
3005
 
1503
- const int vil = (vl >> (2*i)) & 0x03030303;
3006
+ x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3007
+ }
1504
3008
 
1505
- const int vih = ((vh >> i) << 2) & 0x04040404;
3009
+ #pragma unroll
3010
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
1506
3012
 
1507
- const int vi = __vsubss4(vil, vih);
3013
+ if (need_check) {
3014
+ i = min(i, i_max);
3015
+ }
3016
+
3017
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
1508
3018
 
1509
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3019
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
1510
3020
  }
1511
3021
 
1512
- return d*sumf;
1513
- #else
1514
- return 0.0f; // only to satisfy the compiler
1515
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1516
- }
3022
+ #pragma unroll
3023
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
1517
3025
 
1518
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1519
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
3026
+ if (need_check) {
3027
+ i = min(i, i_max);
3028
+ }
1520
3029
 
1521
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1522
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
3030
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
1523
3031
 
1524
- const int bq8_offset = QR4_K * (iqs / QI8_1);
3032
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
3033
+ }
3034
+ }
1525
3035
 
1526
- float sumf_d = 0.0f;
1527
- float sumf_m = 0.0f;
3036
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1528
3039
 
1529
- const float d = bq4_K->d;
1530
- const float dmin = bq4_K->dmin;
3040
+ __builtin_assume(i >= 0);
3041
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
+ __builtin_assume(j >= 0);
3043
+ __builtin_assume(j < WARP_SIZE);
3044
+ __builtin_assume(k >= 0);
3045
+ __builtin_assume(k < WARP_SIZE);
1531
3046
 
1532
- const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
3047
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1533
3049
 
1534
- for (int i = 0; i < QR4_K; ++i) {
1535
- const int isc = bq8_offset + i;
3050
+ const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
+ const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
+ const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
1536
3053
 
1537
- uint8_t sc, m;
1538
- get_scale_min_k4(isc, bq4_K->scales, sc, m);
3054
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
1539
3055
 
1540
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1541
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1542
- const float d8i = bq8i->d;
3056
+ const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
+ const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
1543
3058
 
1544
- const int vi = (v >> (4*i)) & 0x0F0F0F0F;
3059
+ int u[QR6_K];
3060
+ float d8[QR6_K];
1545
3061
 
1546
- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1547
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
3062
+ for (int l = 0; l < QR6_K; ++l) {
3063
+ const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
+ u[l] = y_qs[kqsy];
3065
+ d8[l] = y_ds[kqsy / QI8_1].x;
1548
3066
  }
1549
3067
 
1550
- return d*sumf_d - dmin*sumf_m;
1551
- #else
1552
- return 0.0f; // only to satisfy the compiler
1553
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3068
+ return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
+ x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
1554
3070
  }
1555
3071
 
1556
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1557
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1558
-
1559
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1560
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
3072
+ template <int qk, int qr, int qi, typename block_q_t,
3073
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
+ static __global__ void mul_mat_q(
3075
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3076
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1561
3077
 
1562
- const int bq8_offset = QR5_K * (iqs / QI8_1);
3078
+ const block_q_t * x = (const block_q_t *) vx;
3079
+ const block_q8_1 * y = (const block_q8_1 *) vy;
1563
3080
 
1564
- float sumf_d = 0.0f;
1565
- float sumf_m = 0.0f;
3081
+ const int blocks_per_row_x = ncols_x / qk;
3082
+ const int blocks_per_col_y = nrows_y / QK8_1;
3083
+ const int blocks_per_warp = WARP_SIZE / qi;
1566
3084
 
1567
- const float d = bq5_K->d;
1568
- const float dmin = bq5_K->dmin;
3085
+ const int & ncols_dst = ncols_y;
1569
3086
 
1570
- const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
3087
+ const int tid_x = threadIdx.x;
3088
+ const int tid_y = threadIdx.y;
1571
3089
 
1572
- const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
3090
+ const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3091
+ const int & row_x_0 = row_dst_0;
3092
+ const int row_dst = row_dst_0 + tid_x;
1573
3093
 
1574
- for (int i = 0; i < QR5_K; ++i) {
1575
- const int isc = bq8_offset + i;
3094
+ const int col_dst_0 = blockIdx.y*WARP_SIZE;
3095
+ const int & col_y_0 = col_dst_0;
1576
3096
 
1577
- uint8_t sc, m;
1578
- get_scale_min_k4(isc, bq5_K->scales, sc, m);
3097
+ int * tile_x_ql = nullptr;
3098
+ half2 * tile_x_dm = nullptr;
3099
+ int * tile_x_qh = nullptr;
3100
+ int * tile_x_sc = nullptr;
1579
3101
 
1580
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1581
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1582
- const float d8i = bq8i->d;
3102
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
1583
3103
 
1584
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3104
+ const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
1585
3105
 
1586
- const int vih = ((vh >> i) << 4) & 0x10101010;
3106
+ __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
+ __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
1587
3108
 
1588
- const int vi = vil | vih;
3109
+ float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
1589
3110
 
1590
- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1591
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
1592
- }
3111
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1593
3112
 
1594
- return d*sumf_d - dmin*sumf_m;
1595
- #else
1596
- return 0.0f; // only to satisfy the compiler
1597
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1598
- }
3113
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
+ tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
1599
3115
 
1600
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1601
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
3116
+ for (int ir = 0; ir < qr; ++ir) {
3117
+ const int kqs = ir*WARP_SIZE + tid_x;
3118
+ const int kbxd = kqs / QI8_1;
1602
3119
 
1603
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1604
- const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3120
+ for (int i = 0; i < WARP_SIZE; i += 8) {
3121
+ const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1605
3122
 
1606
- const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1607
- const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1608
- const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3123
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1609
3124
 
1610
- float sumf = 0.0f;
3125
+ tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3126
+ }
3127
+ }
1611
3128
 
1612
- const float d = bq6_K->d;
3129
+ for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
+ const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
+ const int kby = tid_x % blocks_per_tile_y_col;
3132
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
+ tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
+ }
1613
3135
 
1614
- int vl;
1615
- memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
3136
+ __syncthreads();
1616
3137
 
1617
- int vh;
1618
- memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
3138
+ #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
+ #pragma unroll
3140
+ #endif // __CUDA_ARCH__ >= 700
3141
+ for (int k = 0; k < WARP_SIZE; k += vdr) {
3142
+ #pragma unroll
3143
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3144
+ #pragma unroll
3145
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
+ sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
+ tid_x + i, tid_y + j, k);
3148
+ }
3149
+ }
3150
+ }
1619
3151
 
1620
- for (int i = 0; i < QR6_K; ++i) {
1621
- const int sc = bq6_K->scales[scale_offset + 4*i];
3152
+ __syncthreads();
3153
+ }
1622
3154
 
1623
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1624
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1625
- const float d8i = bq8i->d;
1626
3155
 
1627
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3156
+ if (row_dst >= nrows_dst) {
3157
+ return;
3158
+ }
1628
3159
 
1629
- const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
3160
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3161
+ const int col_dst = col_dst_0 + j + tid_y;
1630
3162
 
1631
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
3163
+ if (col_dst >= ncols_dst) {
3164
+ return;
3165
+ }
1632
3166
 
1633
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3167
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3169
+ }
1634
3170
  }
1635
-
1636
- return d*sumf;
1637
- #else
1638
- return 0.0f; // only to satisfy the compiler
1639
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1640
3171
  }
1641
3172
 
1642
- template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
3173
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
1643
3174
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1644
3175
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1645
3176
 
@@ -1648,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1648
3179
  }
1649
3180
 
1650
3181
  const int blocks_per_row = ncols / qk;
1651
- const int blocks_per_warp = WARP_SIZE / qi;
3182
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
1652
3183
 
1653
3184
  // partial sum for each thread
1654
3185
  float tmp = 0.0f;
@@ -1657,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1657
3188
  const block_q8_1 * y = (const block_q8_1 *) vy;
1658
3189
 
1659
3190
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1660
- const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
3191
+ const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
1661
3192
 
1662
- const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
3193
+ const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
1663
3194
 
1664
- const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
3195
+ const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
1665
3196
 
1666
3197
  tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1667
3198
  }
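Note: the new vdr template parameter widens the per-thread work in mul_mat_vec_q — each thread now consumes vdr consecutive ints of quants from one block, so a warp covers vdr*WARP_SIZE/qi blocks per iteration and iqs advances in steps of vdr. A small host-side sketch of the index mapping these lines produce (qi = 8 as for QI8_0; vdr = 2 is an assumed example value for the *_MMVQ constant):

    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int qi  = 8;   // ints of quants per block, e.g. QI8_0
        const int vdr = 2;   // assumed example value

        printf("blocks per warp: %d\n", vdr * WARP_SIZE / qi);
        for (int tid = 0; tid < 8; ++tid) {
            const int ib_off = tid / (qi/vdr);         // block offset within the warp's chunk
            const int iqs    = vdr * (tid % (qi/vdr)); // first quant int handled by this thread
            printf("thread %d -> block +%d, iqs %d\n", tid, ib_off, iqs);
        }
        return 0;
    }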
@@ -1694,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1694
3225
  const int y_offset = qr == 1 ? 1 : qk/2;
1695
3226
 
1696
3227
  // partial sum for each thread
1697
- #ifdef GGML_CUDA_DMMV_F16
3228
+ #ifdef GGML_CUDA_F16
1698
3229
  half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
1699
3230
  #else
1700
3231
  float tmp = 0.0f;
1701
- #endif // GGML_CUDA_DMMV_F16
3232
+ #endif // GGML_CUDA_F16
1702
3233
 
1703
3234
  for (int i = 0; i < ncols; i += iter_stride) {
1704
3235
  const int col = i + vals_per_iter*tid;
@@ -1718,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1718
3249
 
1719
3250
  // matrix multiplication
1720
3251
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
1721
- #ifdef GGML_CUDA_DMMV_F16
3252
+ #ifdef GGML_CUDA_F16
1722
3253
  tmp += __hmul2(v, {
1723
3254
  y[iybs + iqs + j/qr + 0],
1724
3255
  y[iybs + iqs + j/qr + y_offset]
@@ -1726,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1726
3257
  #else
1727
3258
  tmp += v.x * y[iybs + iqs + j/qr + 0];
1728
3259
  tmp += v.y * y[iybs + iqs + j/qr + y_offset];
1729
- #endif // GGML_CUDA_DMMV_F16
3260
+ #endif // GGML_CUDA_F16
1730
3261
  }
1731
3262
  }
1732
3263
 
@@ -1737,19 +3268,23 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1737
3268
  }
1738
3269
 
1739
3270
  if (tid == 0) {
1740
- #ifdef GGML_CUDA_DMMV_F16
3271
+ #ifdef GGML_CUDA_F16
1741
3272
  dst[row] = tmp.x + tmp.y;
1742
3273
  #else
1743
3274
  dst[row] = tmp;
1744
- #endif // GGML_CUDA_DMMV_F16
3275
+ #endif // GGML_CUDA_F16
1745
3276
  }
1746
3277
  }
1747
3278
 
1748
- static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
3279
+ static __global__ void mul_mat_p021_f16_f32(
3280
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
3281
+ const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
3282
+
1749
3283
  const half * x = (const half *) vx;
1750
3284
 
1751
3285
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1752
3286
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
3287
+ const int channel_x = channel / (nchannels_y / nchannels_x);
1753
3288
 
1754
3289
  const int nrows_y = ncols_x;
1755
3290
  const int nrows_dst = nrows_x;
@@ -1765,7 +3300,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
1765
3300
  }
1766
3301
 
1767
3302
  // x is transposed and permuted
1768
- const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
3303
+ const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
1769
3304
  const float xi = __half2float(x[ix]);
1770
3305
 
1771
3306
  const int row_y = col_x;
@@ -1793,12 +3328,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
1793
3328
 
1794
3329
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1795
3330
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1796
- const int row_stride_x, const int channel_stride_x) {
3331
+ const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
1797
3332
 
1798
3333
  const half * x = (const half *) vx;
1799
3334
 
1800
3335
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1801
3336
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
3337
+ const int channel_x = channel / channel_x_divisor;
1802
3338
 
1803
3339
  const int nrows_y = ncols_x;
1804
3340
  const int nrows_dst = nrows_x;
@@ -1815,7 +3351,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1815
3351
  break;
1816
3352
  }
1817
3353
 
1818
- const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
3354
+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
1819
3355
  const float xi = __half2float(x[ix]);
1820
3356
 
1821
3357
  const int row_y = col_x;
@@ -1876,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
1876
3412
  }
1877
3413
 
1878
3414
  // rope == RoPE == rotary positional embedding
1879
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
3415
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3416
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
1880
3417
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
1881
3418
 
1882
3419
  if (col >= ncols) {
@@ -1886,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
1886
3423
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
1887
3424
  const int i = row*ncols + col;
1888
3425
 
1889
- const float theta = p*powf(theta_scale, col/2);
3426
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
1890
3427
  const float sin_theta = sinf(theta);
1891
3428
  const float cos_theta = cosf(theta);
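Note: rope_f32 now derives the rotation angle from a per-row position, p0 + p_delta * (row / p_delta_rows), instead of a single constant p for the whole launch, so one kernel call can rotate a whole batch of tokens. A host-side sketch of that angle computation with assumed example parameters (theta_scale as the usual 10000^(-2/ncols) base is an assumption):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   ncols        = 8;      // rotated dimension (example)
        const float p0           = 5.0f;   // position of the first token (example)
        const float p_delta      = 1.0f;   // position step per token (example)
        const int   p_delta_rows = 2;      // rows that share one position value (example)
        const float theta_scale  = powf(10000.0f, -2.0f/ncols);  // assumed typical base

        const int row = 7;                 // some row in the flattened batch
        for (int col = 0; col < ncols; col += 2) {
            const float theta = (p0 + p_delta * (row / p_delta_rows)) * powf(theta_scale, col/2);
            printf("col %d: cos=% .4f sin=% .4f\n", col, cosf(theta), sinf(theta));
        }
        return 0;
    }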
1892
3429
 
@@ -2027,15 +3564,17 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
2027
3564
  norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
2028
3565
  }
2029
3566
 
2030
- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
3567
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
2031
3568
  GGML_ASSERT(ncols % WARP_SIZE == 0);
2032
3569
  const dim3 block_dims(WARP_SIZE, 1, 1);
2033
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
3570
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
2034
3571
  }
2035
3572
 
2036
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
2037
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
2038
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
3573
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
3574
+ const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
3575
+ const dim3 num_blocks(block_num_x, ky, 1);
3576
+ const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
3577
+ quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
2039
3578
  }
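
quantize_row_q8_1_cuda now launches a 2D grid: x covers one padded row in block-size chunks and y selects the row, so a whole batch of src1 rows is quantized in one call (the single-row caller further down simply passes ky = 1). A sizing sketch, assuming both block-size constants are 256 and MATRIX_ROW_PADDING is 512 as defined earlier in the file:

    #include <stdio.h>

    // Sketch only (not part of the diff): the 2D launch geometry built above.
    int main(void) {
        const int CUDA_QUANTIZE_BLOCK_SIZE = 256;  // assumed value
        const int MATRIX_ROW_PADDING       = 512;  // assumed value

        const int kx        = 4097;  // src1 row length (ne10), illustrative
        const int ky        = 7;     // number of src1 rows (ne11), illustrative
        const int kx_padded = kx % MATRIX_ROW_PADDING == 0 ?
            kx : kx - kx % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;

        // x spans the padded row, y spans the rows; threads past kx only produce padding
        printf("kx_padded = %d -> grid (%d, %d, 1), block (%d, 1, 1)\n",
               kx_padded, block_num_x, ky, CUDA_QUANTIZE_BLOCK_SIZE);
        return 0;
    }
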
2040
3579
 
2041
3580
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2196,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
2196
3735
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2197
3736
  const dim3 block_nums(1, block_num_y, 1);
2198
3737
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2199
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
3738
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
2200
3739
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2201
3740
  }
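
Each of these launchers now threads an extra integer template argument (the VDR_* constants) into mul_mat_vec_q alongside the vec_dot function; it appears to act as a "vec dot ratio", i.e. how many quantized values one vec_dot invocation covers. The sketch below only mirrors that parameter-passing shape with hypothetical names; it is not the real kernel:

    #include <stdio.h>

    // Shape analogy only (not the real mul_mat_vec_q): a compile-time "vdr" factor
    // passed next to the dot-product callback, each call consuming vdr values.
    template <int vdr, int (*vec_dot)(const int *, const int *)>
    static int dot_all(const int * a, const int * b, int n) {
        int sum = 0;
        for (int i = 0; i < n; i += vdr) {   // advance by vdr values per call
            sum += vec_dot(a + i, b + i);
        }
        return sum;
    }

    static int dot2(const int * x, const int * y) { return x[0]*y[0] + x[1]*y[1]; }

    int main() {
        const int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        const int b[8] = {1, 1, 1, 1, 1, 1, 1, 1};
        printf("%d\n", dot_all<2, dot2>(a, b, 8));   // prints 36
        return 0;
    }
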
2202
3741
 
@@ -2205,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
2205
3744
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2206
3745
  const dim3 block_nums(1, block_num_y, 1);
2207
3746
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2208
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
3747
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
2209
3748
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2210
3749
  }
2211
3750
 
@@ -2214,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
2214
3753
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2215
3754
  const dim3 block_nums(1, block_num_y, 1);
2216
3755
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2217
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
3756
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
2218
3757
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2219
3758
  }
2220
3759
 
@@ -2223,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
2223
3762
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2224
3763
  const dim3 block_nums(1, block_num_y, 1);
2225
3764
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2226
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
3765
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
2227
3766
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2228
3767
  }
2229
3768
 
@@ -2232,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
2232
3771
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2233
3772
  const dim3 block_nums(1, block_num_y, 1);
2234
3773
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2235
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
3774
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
2236
3775
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2237
3776
  }
2238
3777
 
@@ -2241,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
2241
3780
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2242
3781
  const dim3 block_nums(1, block_num_y, 1);
2243
3782
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2244
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
3783
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
2245
3784
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2246
3785
  }
2247
3786
 
@@ -2250,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
2250
3789
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2251
3790
  const dim3 block_nums(1, block_num_y, 1);
2252
3791
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2253
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
3792
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
2254
3793
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2255
3794
  }
2256
3795
 
@@ -2259,7 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2259
3798
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2260
3799
  const dim3 block_nums(1, block_num_y, 1);
2261
3800
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2262
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
3801
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
2263
3802
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2264
3803
  }
2265
3804
 
@@ -2268,7 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
2268
3807
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2269
3808
  const dim3 block_nums(1, block_num_y, 1);
2270
3809
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2271
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
3810
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
2272
3811
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2273
3812
  }
2274
3813
 
@@ -2277,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
2277
3816
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2278
3817
  const dim3 block_nums(1, block_num_y, 1);
2279
3818
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2280
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
3819
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
2281
3820
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2282
3821
  }
2283
3822
 
@@ -2324,20 +3863,203 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2324
3863
  }
2325
3864
  }
2326
3865
 
2327
- static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
2328
- const dim3 block_nums(1, nrows_x, nchannels_x);
3866
+ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
+
3870
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3872
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3873
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3874
+
3875
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3878
+ } else {
3879
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3881
+ }
3882
+ }
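
The new ggml_mul_mat_q*_cuda launchers all build the same tile grid: one block per GGML_CUDA_MMQ_Y weight rows and per WARP_SIZE batch columns, with WARP_SIZE x WARP_SIZE/4 threads per block, and the boolean template argument on load_tiles_* appears to select a bounds-checked tile loader when the row count is not a multiple of the tile height. A small sketch of that grid math, with GGML_CUDA_MMQ_Y and WARP_SIZE assumed to be 64 and 32:

    #include <stdio.h>

    // Sketch only (not part of the diff): the launch geometry of the mul_mat_q launchers.
    int main(void) {
        const int GGML_CUDA_MMQ_Y = 64;  // assumed value
        const int WARP_SIZE       = 32;  // assumed value

        const int nrows_x = 4096;  // quantized weight rows, illustrative
        const int ncols_y = 512;   // batch columns, illustrative

        const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
        const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;

        printf("grid  = (%d, %d, 1)\n", block_num_x, block_num_y);
        printf("block = (%d, %d, 1) = %d threads per tile\n",
               WARP_SIZE, WARP_SIZE/4, WARP_SIZE*(WARP_SIZE/4));
        // nrows_x % GGML_CUDA_MMQ_Y == 0 selects the load_tiles_*<false> path,
        // otherwise the <true> specialization that guards the trailing rows is used
        printf("row-bound guard needed: %s\n", nrows_x % GGML_CUDA_MMQ_Y ? "yes" : "no");
        return 0;
    }
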
3883
+
3884
+ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
+
3888
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3890
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3891
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3892
+
3893
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3896
+ } else {
3897
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3899
+ }
3900
+ }
3901
+
3902
+ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
+
3906
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3908
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3909
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3910
+
3911
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3914
+ } else {
3915
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3917
+ }
3918
+ }
3919
+
3920
+ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
+
3924
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3926
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3927
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3928
+
3929
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3932
+ } else {
3933
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3935
+ }
3936
+ }
3937
+
3938
+ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
+
3942
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3944
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3945
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3946
+
3947
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3950
+ } else {
3951
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3953
+ }
3954
+ }
3955
+
3956
+ static void ggml_mul_mat_q2_K_q8_1_cuda(
3957
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3958
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3959
+
3960
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3961
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3962
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3963
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3964
+
3965
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3966
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3967
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
+ } else {
3969
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3970
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3971
+ }
3972
+ }
3973
+
3974
+ static void ggml_mul_mat_q3_K_q8_1_cuda(
3975
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3976
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3977
+
3978
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3979
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3980
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3981
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3982
+
3983
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3984
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ } else {
3987
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3988
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3989
+ }
3990
+ }
3991
+
3992
+ static void ggml_mul_mat_q4_K_q8_1_cuda(
3993
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3994
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3995
+
3996
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3997
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3998
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3999
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4000
+
4001
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4002
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4003
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4004
+ } else {
4005
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4006
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4007
+ }
4008
+ }
4009
+
4010
+ static void ggml_mul_mat_q5_K_q8_1_cuda(
4011
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4012
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4013
+
4014
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4015
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4016
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4017
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4018
+
4019
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4020
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4021
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4022
+ } else {
4023
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4024
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4025
+ }
4026
+ }
4027
+
4028
+ static void ggml_mul_mat_q6_K_q8_1_cuda(
4029
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4030
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4031
+
4032
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4033
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4034
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4035
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4036
+
4037
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4038
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4039
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4040
+ } else {
4041
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4042
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4043
+ }
4044
+ }
4045
+
4046
+ static void ggml_mul_mat_p021_f16_f32_cuda(
4047
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
4048
+ const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
4049
+
4050
+ const dim3 block_nums(1, nrows_x, nchannels_y);
2329
4051
  const dim3 block_dims(WARP_SIZE, 1, 1);
2330
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
4052
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
2331
4053
  }
2332
4054
 
2333
4055
  static void ggml_mul_mat_vec_nc_f16_f32_cuda(
2334
4056
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
2335
- const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
4057
+ const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
2336
4058
 
2337
- const dim3 block_nums(1, nrows_x, nchannels_x);
4059
+ const dim3 block_nums(1, nrows_x, nchannels_y);
2338
4060
  const dim3 block_dims(WARP_SIZE, 1, 1);
2339
4061
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
2340
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
4062
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
2341
4063
  }
2342
4064
 
2343
4065
  static void ggml_cpy_f32_f32_cuda(
@@ -2365,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
2365
4087
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
2366
4088
  }
2367
4089
 
2368
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
4090
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4091
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
2369
4092
  GGML_ASSERT(nrows % 2 == 0);
2370
4093
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2371
4094
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
2372
4095
  const dim3 block_nums(num_blocks_x, nrows, 1);
2373
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
4096
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
2374
4097
  }
2375
4098
 
2376
4099
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2499,6 +4222,7 @@ static int g_device_count = -1;
2499
4222
  static int g_main_device = 0;
2500
4223
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2501
4224
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4225
+ static bool g_mul_mat_q = false;
2502
4226
 
2503
4227
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
2504
4228
 
@@ -2688,6 +4412,7 @@ inline void ggml_cuda_op_mul(
2688
4412
  (void) dst;
2689
4413
  (void) src0_ddq_i;
2690
4414
  (void) i02;
4415
+ (void) i1;
2691
4416
  }
2692
4417
 
2693
4418
  inline void ggml_cuda_op_gelu(
@@ -2767,8 +4492,11 @@ inline void ggml_cuda_op_rms_norm(
2767
4492
  const int64_t ne00 = src0->ne[0];
2768
4493
  const int64_t i01_diff = i01_high - i01_low;
2769
4494
 
4495
+ float eps;
4496
+ memcpy(&eps, dst->op_params, sizeof(float));
4497
+
2770
4498
  // compute
2771
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
4499
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
2772
4500
 
2773
4501
  (void) src1;
2774
4502
  (void) dst;
@@ -2778,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
2778
4506
  (void) i1;
2779
4507
  }
2780
4508
 
4509
+ inline void ggml_cuda_op_mul_mat_q(
4510
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4511
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
4512
+ cudaStream_t & cudaStream_main){
4513
+
4514
+ GGML_ASSERT(src0_ddq_i != nullptr);
4515
+ GGML_ASSERT(src1_ddf_i != nullptr);
4516
+ GGML_ASSERT(dst_ddf_i != nullptr);
4517
+
4518
+ const int64_t ne00 = src0->ne[0];
4519
+
4520
+ const int64_t ne10 = src1->ne[0];
4521
+ const int64_t ne11 = src1->ne[1];
4522
+ GGML_ASSERT(ne10 % QK8_1 == 0);
4523
+
4524
+ const int64_t ne0 = dst->ne[0];
4525
+
4526
+ const int64_t i01_diff = i01_high - i01_low;
4527
+
4528
+ int id;
4529
+ CUDA_CHECK(cudaGetDevice(&id));
4530
+
4531
+ // the main device has a larger memory buffer to hold the results from all GPUs
4532
+ // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
4533
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
4534
+
4535
+ const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
4536
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
4537
+ size_t as;
4538
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
4539
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
4540
+
4541
+ switch (src0->type) {
4542
+ case GGML_TYPE_Q4_0:
4543
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4544
+ break;
4545
+ case GGML_TYPE_Q4_1:
4546
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4547
+ break;
4548
+ case GGML_TYPE_Q5_0:
4549
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4550
+ break;
4551
+ case GGML_TYPE_Q5_1:
4552
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4553
+ break;
4554
+ case GGML_TYPE_Q8_0:
4555
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4556
+ break;
4557
+ case GGML_TYPE_Q2_K:
4558
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4559
+ break;
4560
+ case GGML_TYPE_Q3_K:
4561
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4562
+ break;
4563
+ case GGML_TYPE_Q4_K:
4564
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4565
+ break;
4566
+ case GGML_TYPE_Q5_K:
4567
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4568
+ break;
4569
+ case GGML_TYPE_Q6_K:
4570
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4571
+ break;
4572
+ default:
4573
+ GGML_ASSERT(false);
4574
+ break;
4575
+ }
4576
+
4577
+ ggml_cuda_pool_free(src1_q8_1, as);
4578
+
4579
+ (void) src1;
4580
+ (void) dst;
4581
+ (void) src0_ddf_i;
4582
+ (void) i02;
4583
+ (void) i1;
4584
+ }
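
Before dispatching on src0->type, this op quantizes the f32 src1 slice to q8_1 in a pooled scratch buffer whose rows are padded up to MATRIX_ROW_PADDING, mirroring the padding applied to the weights at upload time. A rough sizing sketch; MATRIX_ROW_PADDING, QK8_1 and sizeof(block_q8_1) are assumed to be 512, 32 and 36 bytes, and the shapes are illustrative:

    #include <stdio.h>

    // Sketch only (not part of the diff): size of the temporary q8_1 copy of src1.
    int main(void) {
        const long MATRIX_ROW_PADDING = 512;  // assumed value
        const long QK8_1              = 32;   // assumed value
        const long block_q8_1_size    = 36;   // assumed sizeof(block_q8_1)

        const long ne10 = 4097;  // src1 row length, illustrative
        const long ne11 = 8;     // src1 rows in this slice, illustrative

        const long padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        const long bytes = padded_row_size*ne11*block_q8_1_size/QK8_1;
        printf("padded_row_size = %ld, q8_1 scratch = %ld bytes\n", padded_row_size, bytes);
        return 0;
    }
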
4585
+
2781
4586
  inline void ggml_cuda_op_mul_mat_vec(
2782
4587
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2783
4588
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2792,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2792
4597
 
2793
4598
  #ifdef GGML_CUDA_FORCE_DMMV
2794
4599
  const bool use_mul_mat_vec_q = false;
4600
+ (void) g_compute_capabilities[0];
2795
4601
  #else
2796
4602
  int id;
2797
4603
  CUDA_CHECK(cudaGetDevice(&id));
@@ -2815,11 +4621,11 @@ inline void ggml_cuda_op_mul_mat_vec(
2815
4621
  #endif
2816
4622
 
2817
4623
  if (use_mul_mat_vec_q) {
2818
- int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2819
- padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
4624
+ const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
4625
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
2820
4626
  size_t as;
2821
4627
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2822
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
4628
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
2823
4629
 
2824
4630
  switch (src0->type) {
2825
4631
  case GGML_TYPE_Q4_0:
@@ -2860,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2860
4666
  ggml_cuda_pool_free(src1_q8_1, as);
2861
4667
  } else {
2862
4668
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
2863
- #ifdef GGML_CUDA_DMMV_F16
4669
+ #ifdef GGML_CUDA_F16
2864
4670
  size_t ash;
2865
4671
  dfloat * src1_dfloat = nullptr; // dfloat == half
2866
4672
 
@@ -2876,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2876
4682
  }
2877
4683
  #else
2878
4684
  dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
2879
- #endif // GGML_CUDA_DMMV_F16
4685
+ #endif // GGML_CUDA_F16
2880
4686
 
2881
4687
  switch (src0->type) {
2882
4688
  case GGML_TYPE_Q4_0:
@@ -2917,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
2917
4723
  break;
2918
4724
  }
2919
4725
 
2920
- #ifdef GGML_CUDA_DMMV_F16
4726
+ #ifdef GGML_CUDA_F16
2921
4727
  if (src1_convert_f16) {
2922
4728
  ggml_cuda_pool_free(src1_dfloat, ash);
2923
4729
  }
2924
- #endif // GGML_CUDA_DMMV_F16
4730
+ #endif // GGML_CUDA_F16
2925
4731
  }
2926
4732
 
2927
4733
  (void) src1;
@@ -2981,32 +4787,35 @@ inline void ggml_cuda_op_rope(
2981
4787
  GGML_ASSERT(dst_ddf_i != nullptr);
2982
4788
 
2983
4789
  const int64_t ne00 = src0->ne[0];
4790
+ const int64_t ne01 = src0->ne[1];
2984
4791
  const int64_t i01_diff = i01_high - i01_low;
2985
4792
 
2986
- const int n_past = ((int32_t *) src1->data)[0];
2987
- const int n_dims = ((int32_t *) src1->data)[1];
2988
- const int mode = ((int32_t *) src1->data)[2];
2989
- const int n_ctx = ((int32_t *) src1->data)[3];
2990
-
4793
+ const int n_past = ((int32_t *) dst->op_params)[0];
4794
+ const int n_dims = ((int32_t *) dst->op_params)[1];
4795
+ const int mode = ((int32_t *) dst->op_params)[2];
4796
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
2991
4797
  // RoPE alteration for extended context
4798
+
2992
4799
  float freq_base, freq_scale;
2993
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
2994
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
4800
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
4801
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
2995
4802
 
2996
4803
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
2997
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
2998
4804
 
2999
- bool is_glm = mode & 4;
4805
+ const bool is_glm = mode & 4;
3000
4806
 
3001
4807
  // compute
3002
4808
  if (is_glm) {
4809
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3003
4810
  const float id_p = min(p, n_ctx - 2.f);
3004
4811
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
3005
4812
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3006
4813
  } else {
3007
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
4814
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
4815
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
3008
4816
  }
3009
4817
 
4818
+ (void) src1;
3010
4819
  (void) dst;
3011
4820
  (void) src0_ddq_i;
3012
4821
  (void) src1_ddf_i;
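
Several ops in this diff (rms_norm, rope, diag_mask_inf, and the VIEW offset further down) stop reading their scalar arguments from src1->data and read them from dst->op_params instead, which is why the unused-src1 casts appear. A stand-alone sketch of that packing/unpacking pattern, using a stand-in struct rather than the real ggml_tensor:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Sketch only (not part of the diff): scalars packed into a fixed int32 array on the
    // destination tensor; ints are read by index, floats recovered with memcpy, in the
    // same (n_past, n_dims, mode, n_ctx, freq_base, freq_scale) order the rope op reads.
    struct fake_tensor { int32_t op_params[8]; };  // stand-in, not ggml_tensor

    int main() {
        fake_tensor dst = {};
        dst.op_params[0] = 32;    // n_past
        dst.op_params[1] = 128;   // n_dims
        dst.op_params[2] = 0;     // mode
        dst.op_params[3] = 2048;  // n_ctx
        const float freq_base = 10000.0f, freq_scale = 1.0f;
        memcpy(dst.op_params + 4, &freq_base,  sizeof(float));
        memcpy(dst.op_params + 5, &freq_scale, sizeof(float));

        float fb, fs;
        memcpy(&fb, dst.op_params + 4, sizeof(float));
        memcpy(&fs, dst.op_params + 5, sizeof(float));
        printf("n_past=%d n_dims=%d freq_base=%.1f freq_scale=%.2f\n",
               (int) dst.op_params[0], (int) dst.op_params[1], fb, fs);
        return 0;
    }
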
@@ -3025,11 +4834,12 @@ inline void ggml_cuda_op_diag_mask_inf(
3025
4834
  const int64_t ne01 = src0->ne[1];
3026
4835
  const int64_t i01_diff = i01_high - i01_low;
3027
4836
 
3028
- const int n_past = ((int32_t *) src1->data)[0];
4837
+ const int n_past = ((int32_t *) dst->op_params)[0];
3029
4838
 
3030
4839
  // compute
3031
4840
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
3032
4841
 
4842
+ (void) src1;
3033
4843
  (void) dst;
3034
4844
  (void) src0_ddq_i;
3035
4845
  (void) src1_ddf_i;
@@ -3097,6 +4907,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3097
4907
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
3098
4908
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
3099
4909
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
4910
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
4911
+
4912
+ GGML_ASSERT(ne03 == ne13);
3100
4913
 
3101
4914
  const int64_t ne0 = dst->ne[0];
3102
4915
  const int64_t ne1 = dst->ne[1];
@@ -3108,12 +4921,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3108
4921
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
3109
4922
 
3110
4923
  // strides for iteration over dims 3 and 2
3111
- const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
3112
- const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
4924
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
4925
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
4926
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
3113
4927
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
3114
4928
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
3115
4929
  const int64_t dst_stride = ne0 * ne1 * stride_mod;
3116
4930
 
4931
+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
4932
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
4933
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
4934
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
4935
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
4936
+
3117
4937
  const size_t src0_ts = ggml_type_size(src0->type);
3118
4938
  const size_t src0_bs = ggml_blck_size(src0->type);
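
The new i02_max/i02_divisor bookkeeping above implements a dim-2 broadcast: when src1 has more dim-2 slices than src0 (ne12 > ne02), the iteration runs over ne12 slices and each src0 slice is reused ne12/ne02 times. A tiny sketch of the index mapping, with illustrative shapes:

    #include <stdio.h>

    // Sketch only (not part of the diff): which src0 slice each src1/dst slice maps to.
    int main(void) {
        const int ne02 = 2;   // src0 slices, illustrative
        const int ne12 = 8;   // src1/dst slices, illustrative

        const int i02_max     = ne02 >= ne12 ? ne02 : ne12;
        const int i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;

        for (int i02 = 0; i02 < i02_max; ++i02) {
            // src0 is only (re)copied when i02 % i02_divisor == 0 and is indexed by
            // i02/i02_divisor, mirroring the i02/i02_divisor uses later in this function
            printf("src1/dst slice %d -> src0 slice %d%s\n",
                   i02, i02 / i02_divisor, i02 % i02_divisor == 0 ? "  (refresh src0 slice)" : "");
        }
        return 0;
    }
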
3119
4939
 
@@ -3130,6 +4950,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3130
4950
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
3131
4951
 
3132
4952
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
4953
+ GGML_ASSERT(!(split && ne02 < ne12));
3133
4954
 
3134
4955
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
3135
4956
 
@@ -3163,10 +4984,17 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3163
4984
  int64_t row_low, row_high;
3164
4985
  if (split) {
3165
4986
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
3166
- row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
4987
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
4988
+
4989
+ if (id == g_device_count - 1) {
4990
+ row_high = nrows0;
4991
+ } else {
4992
+ row_high = nrows0*g_tensor_split[id + 1];
4993
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
4994
+ }
3167
4995
  } else {
3168
4996
  row_low = 0;
3169
- row_high = nrows0;
4997
+ row_high = nrows0*i02_divisor;
3170
4998
  }
3171
4999
  if (row_low == row_high) {
3172
5000
  continue;
@@ -3214,16 +5042,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3214
5042
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
3215
5043
  }
3216
5044
 
3217
- const int64_t i03_max = flatten_rows ? 1 : ne03;
3218
- const int64_t i02_max = flatten_rows ? 1 : ne02;
3219
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
3220
-
3221
5045
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
3222
5046
  const int64_t i13 = i03 % ne13;
3223
5047
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
3224
5048
  const int64_t i12 = i02 % ne12;
3225
5049
 
3226
- const int64_t i0 = i03*ne02 + i02;
5050
+ const int64_t i0 = i03*i02_max + i02;
3227
5051
 
3228
5052
  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
3229
5053
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +5081,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3257
5081
  const int64_t i11 = i13*ne12 + i12;
3258
5082
 
3259
5083
  // for split tensors the data begins at i0 == i0_offset_low
3260
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
3261
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
5084
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
5085
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
3262
5086
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
3263
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
5087
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
3264
5088
 
3265
5089
  // for split tensors the data pointer needs to be rounded down
3266
5090
  // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +5123,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3299
5123
  }
3300
5124
  }
3301
5125
 
3302
- if (!src0_on_device || !src0_is_contiguous) {
5126
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
3303
5127
  if (src0_is_f32) {
3304
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
5128
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
3305
5129
  } else {
3306
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
5130
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
3307
5131
  }
3308
5132
  }
3309
5133
 
@@ -3333,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3333
5157
  if (split) {
3334
5158
  // src0 = weight matrix is saved as a transposed matrix for better memory layout.
3335
5159
  // dst is NOT transposed.
3336
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
5160
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
3337
5161
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
3338
5162
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
3339
- for (int64_t j = 0; j < ne1; ++j) {
3340
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
3341
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
3342
- }
5163
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
5164
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
5165
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
3343
5166
  } else {
3344
5167
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
3345
5168
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
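
For split tensors, the per-row cudaMemcpyAsync loop is replaced by a single strided copy: cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, ...) moves height rows of width bytes, stepping the source by spitch and the destination by dpitch per row, so the packed per-GPU slice of i01_diff columns lands at column i01_low of every full dst row. A host-side analog of what that one call does:

    #include <stdio.h>
    #include <string.h>

    // Sketch only (not part of the diff): host analog of the strided result copy above.
    int main(void) {
        enum { ne0 = 8, ne1 = 3, i01_low = 2, i01_diff = 4 };  // illustrative shapes

        float src[ne1 * i01_diff];        // per-GPU result, pitch = i01_diff floats
        float dst[ne1 * ne0] = {0};       // full dst, pitch = ne0 floats
        for (int i = 0; i < ne1 * i01_diff; ++i) src[i] = (float) (i + 1);

        for (int row = 0; row < ne1; ++row) {              // "height" rows
            memcpy(dst + row*ne0 + i01_low,                // dpitch = ne0*sizeof(float)
                   src + row*i01_diff,                     // spitch = i01_diff*sizeof(float)
                   i01_diff * sizeof(float));              // "width" bytes per row
        }
        for (int i = 0; i < ne1 * ne0; ++i) {
            printf("%4.0f%s", dst[i], (i + 1) % ne0 ? "" : "\n");
        }
        return 0;
    }
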
@@ -3457,6 +5280,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
3457
5280
  const int64_t ne01 = src0->ne[1];
3458
5281
  const int64_t ne02 = src0->ne[2];
3459
5282
 
5283
+ const int64_t ne12 = src1->ne[2];
5284
+
3460
5285
  CUDA_CHECK(cudaSetDevice(g_main_device));
3461
5286
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
3462
5287
 
@@ -3469,7 +5294,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
3469
5294
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
3470
5295
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
3471
5296
 
3472
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
5297
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
3473
5298
  }
3474
5299
 
3475
5300
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +5308,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
3483
5308
  const int64_t ne01 = src0->ne[1];
3484
5309
  const int64_t ne02 = src0->ne[2];
3485
5310
 
5311
+ const int64_t ne12 = src1->ne[2];
5312
+
3486
5313
  const int64_t nb01 = src0->nb[1];
3487
5314
  const int64_t nb02 = src0->nb[2];
3488
5315
 
@@ -3501,7 +5328,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
3501
5328
  const int row_stride_x = nb01 / sizeof(half);
3502
5329
  const int channel_stride_x = nb02 / sizeof(half);
3503
5330
 
3504
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
5331
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
3505
5332
  }
3506
5333
 
3507
5334
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3518,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
3518
5345
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3519
5346
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
3520
5347
  } else {
3521
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5348
+ int min_compute_capability = INT_MAX;
5349
+ for (int id = 0; id < g_device_count; ++id) {
5350
+ if (min_compute_capability > g_compute_capabilities[id]) {
5351
+ min_compute_capability = g_compute_capabilities[id];
5352
+ }
5353
+ }
5354
+
5355
+ if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
5356
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
5357
+ } else {
5358
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5359
+ }
3522
5360
  }
3523
5361
  } else {
3524
5362
  GGML_ASSERT(false);
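
The new dispatch only takes the mul_mat_q path when it is enabled, src0 is quantized, and every device clears MIN_CC_DP4A. A small sketch of that gate; the value 610 for MIN_CC_DP4A and the capability numbers are assumptions for illustration:

    #include <limits.h>
    #include <stdio.h>

    // Sketch only (not part of the diff): why the minimum compute capability is taken
    // over all devices before choosing the integer mul_mat_q path.
    int main(void) {
        const int MIN_CC_DP4A = 610;                            // assumed value
        const int g_compute_capabilities[] = {700, 610, 860};   // illustrative multi-GPU box
        const int g_device_count = 3;

        int min_compute_capability = INT_MAX;
        for (int id = 0; id < g_device_count; ++id) {
            if (min_compute_capability > g_compute_capabilities[id]) {
                min_compute_capability = g_compute_capabilities[id];
            }
        }
        // when a weight tensor is split across GPUs it runs on every device, so the
        // weakest GPU decides whether mul_mat_q can be used at all
        printf("min cc = %d -> mul_mat_q eligible: %s\n",
               min_compute_capability, min_compute_capability >= MIN_CC_DP4A ? "yes" : "no");
        return 0;
    }
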
@@ -3595,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
3595
5433
 
3596
5434
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3597
5435
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3598
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
5436
+
5437
+ const int mode = ((int32_t *) dst->op_params)[2];
5438
+ const bool is_glm = mode & 4;
5439
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
3599
5440
  }
3600
5441
 
3601
5442
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3628,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3628
5469
  row_high = nrows;
3629
5470
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
3630
5471
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
3631
- row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
5472
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
5473
+
5474
+ if (id == g_device_count - 1) {
5475
+ row_high = nrows;
5476
+ } else {
5477
+ row_high = nrows*g_tensor_split[id + 1];
5478
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
5479
+ }
3632
5480
  } else {
3633
5481
  GGML_ASSERT(false);
3634
5482
  }
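
As in ggml_cuda_op above, the boundaries of a split tensor are now snapped down to a multiple of GGML_CUDA_MMQ_Y so each GPU's slice lines up with the mul_mat_q tile height. A worked example with assumed values (tile height 64, a 0 / 0.57 split):

    #include <stdio.h>

    // Sketch only (not part of the diff): per-GPU row ranges after the rounding.
    int main(void) {
        const int   GGML_CUDA_MMQ_Y  = 64;            // assumed value
        const int   g_device_count   = 2;
        const float g_tensor_split[] = {0.0f, 0.57f}; // illustrative split
        const long  nrows            = 4096;          // illustrative row count

        for (int id = 0; id < g_device_count; ++id) {
            long row_low = id == 0 ? 0 : (long) (nrows*g_tensor_split[id]);
            row_low -= row_low % GGML_CUDA_MMQ_Y;

            long row_high;
            if (id == g_device_count - 1) {
                row_high = nrows;
            } else {
                row_high = (long) (nrows*g_tensor_split[id + 1]);
                row_high -= row_high % GGML_CUDA_MMQ_Y;
            }
            printf("device %d: rows [%ld, %ld)\n", id, row_low, row_high);
        }
        return 0;
    }
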
@@ -3642,7 +5490,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3642
5490
  size_t size = ggml_nbytes_split(tensor, nrows_split);
3643
5491
  const size_t original_size = size;
3644
5492
 
3645
- // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
5493
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
3646
5494
  if (ne0 % MATRIX_ROW_PADDING != 0) {
3647
5495
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
3648
5496
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +5506,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3658
5506
  }
3659
5507
 
3660
5508
 
3661
- CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
5509
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
3662
5510
 
3663
5511
  extra->data_device[id] = buf;
3664
5512
 
@@ -3738,7 +5586,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
3738
5586
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
3739
5587
  size_t offset = 0;
3740
5588
  if (tensor->op == GGML_OP_VIEW) {
3741
- memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
5589
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
3742
5590
  }
3743
5591
  extra = ggml_cuda_alloc_temp_tensor_extra();
3744
5592
  extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3802,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
3802
5650
  }
3803
5651
  }
3804
5652
 
5653
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
5654
+ g_mul_mat_q = mul_mat_q;
5655
+ }
5656
+
3805
5657
  void ggml_cuda_set_scratch_size(size_t scratch_size) {
3806
5658
  g_scratch_size = scratch_size;
3807
5659
  }
@@ -3840,18 +5692,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
3840
5692
  }
3841
5693
  func = ggml_cuda_mul;
3842
5694
  break;
3843
- case GGML_OP_GELU:
3844
- if (!any_on_device) {
3845
- return false;
3846
- }
3847
- func = ggml_cuda_gelu;
3848
- break;
3849
- case GGML_OP_SILU:
3850
- if (!any_on_device) {
3851
- return false;
3852
- }
3853
- func = ggml_cuda_silu;
3854
- break;
5695
+ case GGML_OP_UNARY:
5696
+ switch (ggml_get_unary_op(tensor)) {
5697
+ case GGML_UNARY_OP_GELU:
5698
+ if (!any_on_device) {
5699
+ return false;
5700
+ }
5701
+ func = ggml_cuda_gelu;
5702
+ break;
5703
+ case GGML_UNARY_OP_SILU:
5704
+ if (!any_on_device) {
5705
+ return false;
5706
+ }
5707
+ func = ggml_cuda_silu;
5708
+ break;
5709
+ default:
5710
+ return false;
5711
+ } break;
3855
5712
  case GGML_OP_NORM:
3856
5713
  if (!any_on_device) {
3857
5714
  return false;