llama_cpp 0.3.5 → 0.3.6

@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
52
  } while (0)
53
53
  #endif // CUDART_VERSION >= 11
54
54
 
55
- #ifdef GGML_CUDA_DMMV_F16
55
+ #ifdef GGML_CUDA_F16
56
56
  typedef half dfloat; // dequantize float
57
57
  typedef half2 dfloat2;
58
58
  #else
59
59
  typedef float dfloat; // dequantize float
60
60
  typedef float2 dfloat2;
61
- #endif //GGML_CUDA_DMMV_F16
61
+ #endif //GGML_CUDA_F16
62
+
63
+ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
64
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
65
+
66
+ int x32 = 0;
67
+ x32 |= x16[0] << 0;
68
+ x32 |= x16[1] << 16;
69
+
70
+ return x32;
71
+ }
72
+
73
+ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
74
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
75
+
76
+ int x32 = 0;
77
+ x32 |= x16[0] << 0;
78
+ x32 |= x16[1] << 16;
79
+
80
+ return x32;
81
+ }
82
+
83
+ static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
84
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
85
+ }
86
+
87
+ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
88
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
89
+ }
62
90
 
63
91
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
64
92
  typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
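The four get_int_from_* helpers added above are the load primitives the new integer kernels build on: they fetch four consecutive quant bytes as one 32-bit integer, and the unaligned variants assemble that value from two 16-bit loads because the quant block layouts only guarantee 2-byte alignment. A minimal host-side sketch of the same assembly; the name get_int_from_uint8_host is ad hoc and not part of the diff:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Host-side stand-in for get_int_from_uint8: build a 32-bit value from two 16-bit loads,
// assuming a little-endian target, like the device code does.
static int get_int_from_uint8_host(const uint8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // only 2-byte alignment needed
    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;
    return x32;
}

int main() {
    const uint8_t qs[8] = {0x01, 0x02, 0x03, 0x04, 0x11, 0x12, 0x13, 0x14};
    int direct;
    memcpy(&direct, qs, sizeof(direct));                              // what an aligned 4-byte load returns
    printf("%08x %08x\n", get_int_from_uint8_host(qs, 0), direct);    // both print 04030201 here
    return 0;
}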
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
87
115
  #define QR4_1 2
88
116
  #define QI4_1 (QK4_1 / (4 * QR4_1))
89
117
  typedef struct {
90
- half d; // delta
91
- half m; // min
118
+ half2 dm; // dm.x = delta, dm.y = min
92
119
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
93
120
  } block_q4_1;
94
121
  static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
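The separate half d / half m pair becomes a single half2 dm here (and likewise for q5_1, q8_1 and the k-quant blocks below), so both scales arrive with one 32-bit load and can feed paired half2 arithmetic. The block size is unchanged, as the static_assert above still verifies. A small layout sketch under that assumption, built with nvcc since it pulls in cuda_fp16.h; the struct names are illustrative, not the diff's definitions:

#include <cuda_fp16.h>
#include <cstdio>

// The old pair of half scales and the new packed half2 occupy the same 4 bytes,
// so block size and on-disk layout stay the same; only the access pattern changes.
struct q4_1_old { half  d, m;  unsigned char qs[16]; }; // 16 == QK4_1/2 with QK4_1 == 32
struct q4_1_new { half2 dm;    unsigned char qs[16]; };

int main() {
    static_assert(sizeof(q4_1_old) == sizeof(q4_1_new), "same block size");
    printf("block size: %zu bytes\n", sizeof(q4_1_new)); // 20 on the usual targets
    return 0;
}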
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
107
134
  #define QR5_1 2
108
135
  #define QI5_1 (QK5_1 / (4 * QR5_1))
109
136
  typedef struct {
110
- half d; // delta
111
- half m; // min
137
+ half2 dm; // dm.x = delta, dm.y = min
112
138
  uint8_t qh[4]; // 5-th bit of quants
113
139
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
114
140
  } block_q5_1;
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
127
153
  #define QR8_1 1
128
154
  #define QI8_1 (QK8_1 / (4 * QR8_1))
129
155
  typedef struct {
130
- half d; // delta
131
- half s; // unquantized sum
156
+ half2 ds; // ds.x = delta, ds.y = sum
132
157
  int8_t qs[QK8_0]; // quants
133
158
  } block_q8_1;
134
159
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
135
160
 
136
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
161
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
162
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
163
+ typedef void (*load_tiles_cuda_t)(
164
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
165
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
166
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
167
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
168
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
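The half2 ds field in block_q8_1 packs the delta next to the sum of the block's original values. Storing the sum pays off in the q4_1/q5_1 style dot products further down: the per-block minimum only ever multiplies that sum, so it never has to touch the individual quants. A quick host-side check of the identity, with made-up values:

#include <cstdio>

// sum_i (d4*q4_i + m4) * y_i  ==  d4*d8*sum_i q4_i*q8_i + m4*s,   with y_i = d8*q8_i
// and s = sum_i y_i, which is (up to quantization rounding) what quantize_q8_1 stores in ds.y.
// The kernels only split the m4*s term across the calls sharing one block, hence the
// "/ (QI8_1 / (vdr * QR4_1))" factor in vec_dot_q4_1_q8_1_impl.
int main() {
    const float d4 = 0.5f, m4 = -1.25f, d8 = 0.1f;
    const int   q4[4] = {3, 7, 12, 0};
    const int   q8[4] = {-5, 20, 9, -33};

    float lhs = 0.0f, s = 0.0f;
    int   sumi = 0;
    for (int i = 0; i < 4; ++i) {
        const float y = d8 * q8[i];        // dequantized activation
        lhs  += (d4 * q4[i] + m4) * y;
        s    += y;
        sumi += q4[i] * q8[i];
    }
    printf("lhs = %f, rhs = %f\n", lhs, d4 * d8 * sumi + m4 * s); // agree up to rounding
    return 0;
}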
137
169
 
138
170
  //================================= k-quants
139
171
 
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
150
182
  typedef struct {
151
183
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
152
184
  uint8_t qs[QK_K/4]; // quants
153
- half d; // super-block scale for quantized scales
154
- half dmin; // super-block scale for quantized mins
185
+ half2 dm; // super-block scale for quantized scales/mins
155
186
  } block_q2_K;
156
187
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
157
188
 
@@ -180,8 +211,7 @@ typedef struct {
180
211
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
181
212
  #else
182
213
  typedef struct {
183
- half d; // super-block scale for quantized scales
184
- half dmin; // super-block scale for quantized mins
214
+ half2 dm; // super-block scale for quantized scales/mins
185
215
  uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
186
216
  uint8_t qs[QK_K/2]; // 4--bit quants
187
217
  } block_q4_K;
@@ -200,11 +230,10 @@ typedef struct {
200
230
  static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
201
231
  #else
202
232
  typedef struct {
203
- half d; // super-block scale for quantized scales
204
- half dmin; // super-block scale for quantized mins
205
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
206
- uint8_t qh[QK_K/8]; // quants, high bit
207
- uint8_t qs[QK_K/2]; // quants, low 4 bits
233
+ half2 dm; // super-block scale for quantized scales/mins
234
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
235
+ uint8_t qh[QK_K/8]; // quants, high bit
236
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
208
237
  } block_q5_K;
209
238
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
210
239
  #endif
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
233
262
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
234
263
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
235
264
 
265
+ #ifndef GGML_CUDA_MMQ_Y
266
+ #define GGML_CUDA_MMQ_Y 64
267
+ #endif // GGML_CUDA_MMQ_Y
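GGML_CUDA_MMQ_Y is the tile size in the i direction of the new mul_mat_q kernels (the range the load_tiles_* and vec_dot_*_mul_mat functions iterate over). It follows the same #ifndef pattern as the GGML_CUDA_DMMV_* knobs below, so it can be overridden from the compiler command line (for example -DGGML_CUDA_MMQ_Y=32) and otherwise defaults to 64. A self-contained illustration of the pattern:

#include <cstdio>

#ifndef GGML_CUDA_MMQ_Y
#define GGML_CUDA_MMQ_Y 64
#endif // GGML_CUDA_MMQ_Y

int main() {
    printf("GGML_CUDA_MMQ_Y = %d\n", GGML_CUDA_MMQ_Y); // 64 unless overridden with -D at build time
    return 0;
}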
268
+
236
269
  // dmmv = dequantize_mul_mat_vec
237
270
  #ifndef GGML_CUDA_DMMV_X
238
271
  #define GGML_CUDA_DMMV_X 32
@@ -367,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
367
400
  v.x = vui & 0xF;
368
401
  v.y = vui >> 4;
369
402
 
370
- #ifdef GGML_CUDA_DMMV_F16
403
+ #ifdef GGML_CUDA_F16
371
404
  v = __hsub2(v, {8.0f, 8.0f});
372
405
  v = __hmul2(v, {d, d});
373
406
  #else
374
407
  v.x = (v.x - 8.0f) * d;
375
408
  v.y = (v.y - 8.0f) * d;
376
- #endif // GGML_CUDA_DMMV_F16
409
+ #endif // GGML_CUDA_F16
377
410
  }
378
411
 
379
412
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
380
413
  const block_q4_1 * x = (const block_q4_1 *) vx;
381
414
 
382
- const dfloat d = x[ib].d;
383
- const dfloat m = x[ib].m;
415
+ const dfloat d = x[ib].dm.x;
416
+ const dfloat m = x[ib].dm.y;
384
417
 
385
418
  const int vui = x[ib].qs[iqs];
386
419
 
387
420
  v.x = vui & 0xF;
388
421
  v.y = vui >> 4;
389
422
 
390
- #ifdef GGML_CUDA_DMMV_F16
423
+ #ifdef GGML_CUDA_F16
391
424
  v = __hmul2(v, {d, d});
392
425
  v = __hadd2(v, {m, m});
393
426
  #else
394
427
  v.x = (v.x * d) + m;
395
428
  v.y = (v.y * d) + m;
396
- #endif // GGML_CUDA_DMMV_F16
429
+ #endif // GGML_CUDA_F16
397
430
  }
398
431
 
399
432
  static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -410,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
410
443
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
411
444
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
412
445
 
413
- #ifdef GGML_CUDA_DMMV_F16
446
+ #ifdef GGML_CUDA_F16
414
447
  v = __hsub2(v, {16.0f, 16.0f});
415
448
  v = __hmul2(v, {d, d});
416
449
  #else
417
450
  v.x = (v.x - 16.0f) * d;
418
451
  v.y = (v.y - 16.0f) * d;
419
- #endif // GGML_CUDA_DMMV_F16
452
+ #endif // GGML_CUDA_F16
420
453
  }
421
454
 
422
455
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
423
456
  const block_q5_1 * x = (const block_q5_1 *) vx;
424
457
 
425
- const dfloat d = x[ib].d;
426
- const dfloat m = x[ib].m;
458
+ const dfloat d = x[ib].dm.x;
459
+ const dfloat m = x[ib].dm.y;
427
460
 
428
461
  uint32_t qh;
429
462
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
434
467
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
435
468
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
436
469
 
437
- #ifdef GGML_CUDA_DMMV_F16
470
+ #ifdef GGML_CUDA_F16
438
471
  v = __hmul2(v, {d, d});
439
472
  v = __hadd2(v, {m, m});
440
473
  #else
441
474
  v.x = (v.x * d) + m;
442
475
  v.y = (v.y * d) + m;
443
- #endif // GGML_CUDA_DMMV_F16
476
+ #endif // GGML_CUDA_F16
444
477
  }
445
478
 
446
479
  static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
451
484
  v.x = x[ib].qs[iqs + 0];
452
485
  v.y = x[ib].qs[iqs + 1];
453
486
 
454
- #ifdef GGML_CUDA_DMMV_F16
487
+ #ifdef GGML_CUDA_F16
455
488
  v = __hmul2(v, {d, d});
456
489
  #else
457
490
  v.x *= d;
458
491
  v.y *= d;
459
- #endif // GGML_CUDA_DMMV_F16
492
+ #endif // GGML_CUDA_F16
460
493
  }
461
494
 
462
495
  //================================== k-quants
@@ -475,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
475
508
  const uint8_t q = x[i].qs[32*n + l];
476
509
  float * y = yy + i*QK_K + 128*n;
477
510
 
478
- float dall = x[i].d;
479
- float dmin = x[i].dmin;
511
+ float dall = x[i].dm.x;
512
+ float dmin = x[i].dm.y;
480
513
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
481
514
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
482
515
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
486
519
  const int il = tid%16; // 0...15
487
520
  const uint8_t q = x[i].qs[il] >> (2*is);
488
521
  float * y = yy + i*QK_K + 16*is + il;
489
- float dall = x[i].d;
490
- float dmin = x[i].dmin;
522
+ float dall = x[i].dm.x;
523
+ float dmin = x[i].dm.y;
491
524
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
492
525
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
493
526
  #endif
@@ -573,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
573
606
 
574
607
  float * y = yy + i*QK_K + 64*il + n*ir;
575
608
 
576
- const float dall = x[i].d;
577
- const float dmin = x[i].dmin;
609
+ const float dall = x[i].dm.x;
610
+ const float dmin = x[i].dm.y;
578
611
 
579
612
  const uint8_t * q = x[i].qs + 32*il + n*ir;
580
613
 
@@ -612,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
612
645
 
613
646
  float * y = yy + i*QK_K + 64*il + 2*ir;
614
647
 
615
- const float dall = x[i].d;
616
- const float dmin = x[i].dmin;
648
+ const float dall = x[i].dm.x;
649
+ const float dmin = x[i].dm.y;
617
650
 
618
651
  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
619
652
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
725
758
  const float * y = yy + i * QK_K + y_offset;
726
759
  const uint8_t * q = x[i].qs + q_offset;
727
760
 
728
- const float dall = x[i].d;
729
- const float dmin = x[i].dmin;
761
+ const float dall = x[i].dm.x;
762
+ const float dmin = x[i].dm.y;
730
763
 
731
764
  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
732
765
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
768
801
  uaux[0] = s[0] & 0x0f0f0f0f;
769
802
  uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
770
803
 
771
- const half2 * dh = (const half2 *)&x[i].d;
772
-
773
- const float2 dall = __half22float2(dh[0]);
804
+ const float2 dall = __half22float2(x[i].dm);
774
805
 
775
806
  float sum1 = 0, sum2 = 0;
776
807
  for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +979,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
948
979
  const float * y1 = yy + i*QK_K + y_offset;
949
980
  const float * y2 = y1 + 128;
950
981
 
951
- const float dall = x[i].d;
952
- const float dmin = x[i].dmin;
982
+ const float dall = x[i].dm.x;
983
+ const float dmin = x[i].dm.y;
953
984
 
954
985
  const uint16_t * a = (const uint16_t *)x[i].scales;
955
986
  aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1112,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1081
1112
  const float * y1 = yy + i*QK_K + y_offset;
1082
1113
  const float * y2 = y1 + 128;
1083
1114
 
1084
- const float dall = x[i].d;
1085
- const float dmin = x[i].dmin;
1115
+ const float dall = x[i].dm.x;
1116
+ const float dmin = x[i].dm.y;
1086
1117
 
1087
1118
  const uint16_t * a = (const uint16_t *)x[i].scales;
1088
1119
  aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1270
1301
  v.y = x[ib + iqs + 1];
1271
1302
  }
1272
1303
 
1273
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1274
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
1304
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1305
+ const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1275
1306
 
1276
- if (i >= k) {
1307
+ if (ix >= kx_padded) {
1277
1308
  return;
1278
1309
  }
1279
1310
 
1311
+ const int iy = blockDim.y*blockIdx.y + threadIdx.y;
1312
+
1313
+ const int i_padded = iy*kx_padded + ix;
1314
+
1280
1315
  block_q8_1 * y = (block_q8_1 *) vy;
1281
1316
 
1282
- const int ib = i / QK8_1; // block index
1283
- const int iqs = i % QK8_1; // quant index
1317
+ const int ib = i_padded / QK8_1; // block index
1318
+ const int iqs = i_padded % QK8_1; // quant index
1284
1319
 
1285
- const float xi = i < ndata ? x[i] : 0.0f;
1320
+ const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
1286
1321
  float amax = fabsf(xi);
1287
1322
  float sum = xi;
1288
1323
 
@@ -1301,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1301
1336
  return;
1302
1337
  }
1303
1338
 
1304
- y[ib].d = d;
1305
- y[ib].s = sum;
1339
+ y[ib].ds.x = d;
1340
+ y[ib].ds.y = sum;
1306
1341
  }
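quantize_q8_1 now takes the true row length kx plus a padded length kx_padded and runs on a 2D grid, so all rows of the activation matrix are quantized in one launch and the padding tail of each row is written as zeros, keeping every block_q8_1 fully initialized. A host-side sketch of just the index and padding logic, with made-up sizes (the real kx_padded is a multiple of QK8_1):

#include <cstdio>

int main() {
    const int   kx = 5, kx_padded = 8, rows = 2;
    const float x[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};               // rows * kx source values

    for (int iy = 0; iy < rows; ++iy) {
        for (int ix = 0; ix < kx_padded; ++ix) {
            const int   i_padded = iy * kx_padded + ix;                // destination index, as in the kernel
            const float xi       = ix < kx ? x[iy * kx + ix] : 0.0f;   // zero-fill the padding tail
            printf("dst[%2d] = %4.1f%c", i_padded, xi, ix + 1 == kx_padded ? '\n' : ' ');
        }
    }
    return 0;
}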
1307
1342
 
1308
1343
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1326
1361
  y[iybs + iqs + y_offset] = v.y;
1327
1362
  }
1328
1363
 
1329
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1330
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1331
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1332
- const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1364
+ // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
1365
+ // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
1333
1366
 
1334
- int vi;
1335
- memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1336
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1337
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1367
+ #define VDR_Q4_0_Q8_1_MMVQ 2
1368
+ #define VDR_Q4_0_Q8_1_MMQ 4
1338
1369
 
1339
- const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1370
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
1371
+ const int * v, const int * u, const float & d4, const half2 & ds8) {
1372
+
1373
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1374
+ int sumi = 0;
1340
1375
 
1341
- // subtract 8 from each quantized value
1342
- const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1343
- const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1376
+ #pragma unroll
1377
+ for (int i = 0; i < vdr; ++i) {
1378
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1379
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1344
1380
 
1345
- // SIMD dot product of quantized values
1346
- int sumi = __dp4a(vi0, ui0, 0);
1347
- sumi = __dp4a(vi1, ui1, sumi);
1381
+ // SIMD dot product of quantized values
1382
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1383
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1384
+ }
1348
1385
 
1349
- return sumi*d;
1386
+ // second part effectively subtracts 8 from each quant value
1387
+ return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
1350
1388
  #else
1351
1389
  return 0.0f; // only to satisfy the compiler
1352
1390
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1353
1391
  }
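Compared with the old vec_dot_q4_0_q8_1, the rewritten impl above no longer subtracts 8 from the packed nibbles with __vsub4; it folds that offset into the scale arithmetic, using the block sum stored in ds8.y. A host-side check of the identity it relies on, at whole-block granularity and with made-up values:

#include <cstdio>

// d4 * sum_i (q4_i - 8) * (d8 * q8_i)  ==  d4 * (d8 * sum_i q4_i*q8_i - 8*s),
// where s = d8 * sum_i q8_i is (up to rounding) the block sum held in ds8.y; the (8*vdr/QI4_0)
// factor in vec_dot_q4_0_q8_1_impl spreads the 8*s term over the QI4_0/vdr calls sharing a block.
int main() {
    const float d4 = 0.25f, d8 = 0.05f;
    int q4[32], q8[32];
    for (int i = 0; i < 32; ++i) { q4[i] = (i * 7) % 16; q8[i] = (i * 13) % 31 - 15; }

    float ref = 0.0f, s = 0.0f;
    int   sumi = 0;
    for (int i = 0; i < 32; ++i) {
        ref  += d4 * (q4[i] - 8) * (d8 * q8[i]);
        s    += d8 * q8[i];
        sumi += q4[i] * q8[i];
    }
    printf("ref = %f, fast = %f\n", ref, d4 * (d8 * sumi - 8.0f * s)); // agree up to rounding
    return 0;
}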
1354
1392
 
1355
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1356
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1357
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1358
- const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1393
+ #define VDR_Q4_1_Q8_1_MMVQ 2
1394
+ #define VDR_Q4_1_Q8_1_MMQ 4
1395
+
1396
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
1397
+ const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
1359
1398
 
1360
- const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1361
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1362
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1399
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1400
+ int sumi = 0;
1363
1401
 
1364
- const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1365
- const float m = bq4_1->m;
1366
- const float s = bq8_1->s;
1402
+ #pragma unroll
1403
+ for (int i = 0; i < vdr; ++i) {
1404
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1405
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1367
1406
 
1368
- const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1369
- const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1407
+ // SIMD dot product of quantized values
1408
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1409
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1410
+ }
1370
1411
 
1371
- // SIMD dot product of quantized values
1372
- int sumi = __dp4a(vi0, ui0, 0);
1373
- sumi = __dp4a(vi1, ui1, sumi);
1412
+ #ifdef GGML_CUDA_F16
1413
+ const half2 tmp = __hmul2(dm4, ds8);
1414
+ const float d4d8 = __half2float(tmp.x);
1415
+ const float m4s8 = __half2float(tmp.y);
1416
+ #else
1417
+ const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
1418
+ const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
1419
+ #endif // GGML_CUDA_F16
1374
1420
 
1375
- return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1421
+ // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1422
+ return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1376
1423
  #else
1377
1424
  return 0.0f; // only to satisfy the compiler
1378
1425
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1379
1426
  }
1380
1427
 
1381
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1382
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1428
+ #define VDR_Q5_0_Q8_1_MMVQ 2
1429
+ #define VDR_Q5_0_Q8_1_MMQ 4
1430
+
1431
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
1432
+ const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
1433
+
1383
1434
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1384
- const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1435
+ int sumi = 0;
1436
+
1437
+ for (int i = 0; i < vdr; ++i) {
1438
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1439
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1440
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1441
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1442
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1443
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1444
+
1445
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1446
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1447
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1448
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1449
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1450
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1451
+ }
1385
1452
 
1386
- int qs;
1387
- memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1388
- const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1389
- const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1390
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1391
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1392
-
1393
- const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1394
-
1395
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1396
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1397
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1398
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1399
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1400
- vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1401
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1402
-
1403
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1404
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1405
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1406
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1407
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1408
- vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1409
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1410
-
1411
- return sumi*d;
1453
+ // second part effectively subtracts 16 from each quant value
1454
+ return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
1412
1455
  #else
1413
1456
  return 0.0f; // only to satisfy the compiler
1414
1457
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1415
1458
  }
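The q5_0/q5_1 impls above splice the high bits into the packed nibbles at bit positions 4, 12, 20 and 28, i.e. into bit 4 of each byte lane, before the dp4a. A tiny host-side illustration of that splice with arbitrary example values:

#include <cstdio>

int main() {
    const int vl = 0x0F0A0500;          // four low nibbles, already one per byte lane
    const int vh = 0b1010;              // 5th bits for those four values (bit i belongs to lane i)
    int v = vl;
    v |= (vh << 4)  & 0x00000010;       // bit 0 of vh -> bit 4  (lane 0)
    v |= (vh << 11) & 0x00001000;       // bit 1 of vh -> bit 12 (lane 1)
    v |= (vh << 18) & 0x00100000;       // bit 2 of vh -> bit 20 (lane 2)
    v |= (vh << 25) & 0x10000000;       // bit 3 of vh -> bit 28 (lane 3)
    printf("%08x\n", v);                // 1f0a1500: lane values 0x00, 0x15, 0x0A, 0x1F
    return 0;
}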
1416
1459
 
1417
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1418
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1460
+ #define VDR_Q5_1_Q8_1_MMVQ 2
1461
+ #define VDR_Q5_1_Q8_1_MMQ 4
1462
+
1463
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
1464
+ const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
1465
+
1419
1466
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1420
- const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1467
+ int sumi = 0;
1468
+
1469
+ for (int i = 0; i < vdr; ++i) {
1470
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1471
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1472
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1473
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1474
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1475
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1476
+
1477
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1478
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1479
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1480
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1481
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1482
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1483
+ }
1484
+
1485
+ #ifdef GGML_CUDA_F16
1486
+ const half2 tmp = __hmul2(dm5, ds8);
1487
+ const float d5d8 = __half2float(tmp.x);
1488
+ const float m5s8 = __half2float(tmp.y);
1489
+ #else
1490
+ const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
1491
+ const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
1492
+ #endif // GGML_CUDA_F16
1493
+
1494
+ // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
1495
+ return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1421
1496
 
1422
- const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1423
- const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1424
- const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1425
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1426
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1427
-
1428
- const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1429
- const float m = bq5_1->m;
1430
- const float s = bq8_1->s;
1431
-
1432
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1433
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1434
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1435
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1436
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1437
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1438
-
1439
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1440
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1441
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1442
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1443
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1444
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1445
-
1446
- return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1447
1497
  #else
1448
1498
  return 0.0f; // only to satisfy the compiler
1449
1499
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1450
1500
  }
1451
1501
 
1452
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1453
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1454
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1455
- const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1502
+ #define VDR_Q8_0_Q8_1_MMVQ 2
1503
+ #define VDR_Q8_0_Q8_1_MMQ 8
1456
1504
 
1457
- int vi;
1458
- memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1459
- const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1505
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1506
+ const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
1460
1507
 
1461
- const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1508
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1509
+ int sumi = 0;
1462
1510
 
1463
- // SIMD dot product of quantized values
1464
- int sumi = __dp4a(vi, ui, 0);
1511
+ for (int i = 0; i < vdr; ++i) {
1512
+ // SIMD dot product of quantized values
1513
+ sumi = __dp4a(v[i], u[i], sumi);
1514
+ }
1465
1515
 
1466
- return sumi*d;
1516
+ return sumi * d8_0 * __half2float(ds8_1.x);
1467
1517
  #else
1468
1518
  return 0.0f; // only to satisfy the compiler
1469
1519
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1470
1520
  }
1471
1521
 
1472
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1473
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1522
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
1523
+ const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
1474
1524
 
1475
1525
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1476
- const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1526
+ int sumi = 0;
1477
1527
 
1478
- const int bq8_offset = QR2_K * (iqs / QI8_1);
1479
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1528
+ for (int i = 0; i < vdr; ++i) {
1529
+ // SIMD dot product of quantized values
1530
+ sumi = __dp4a(v[i], u[i], sumi);
1531
+ }
1480
1532
 
1481
- float sumf_d = 0.0f;
1482
- float sumf_m = 0.0f;
1533
+ #ifdef GGML_CUDA_F16
1534
+ const half2 tmp = __hmul2(dm8, ds8);
1535
+ const float d8d8 = __half2float(tmp.x);
1536
+ const float m8s8 = __half2float(tmp.y);
1537
+ #else
1538
+ const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
+ const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1540
+ #endif // GGML_CUDA_F16
1483
1541
 
1484
- const float d = bq2_K->d;
1485
- const float dmin = bq2_K->dmin;
1542
+ // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1543
+ return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1544
+ #else
1545
+ return 0.0f; // only to satisfy the compiler
1546
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
+ }
1486
1548
 
1487
- const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1549
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1550
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1488
1551
 
1489
- for (int i = 0; i < QR2_K; ++i) {
1490
- const int sc = bq2_K->scales[scale_offset + 2*i];
1552
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1491
1553
 
1492
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1493
- const float d8i = bq8i->d;
1554
+ int v[VDR_Q4_0_Q8_1_MMVQ];
1555
+ int u[2*VDR_Q4_0_Q8_1_MMVQ];
1494
1556
 
1495
- const int vi = (v >> (2*i)) & 0x03030303;
1496
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1557
+ #pragma unroll
1558
+ for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
1559
+ v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
1560
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1561
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
1562
+ }
1563
+
1564
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
+ }
1566
+
1567
+ static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
+
1569
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1571
+
1572
+ *x_ql = tile_x_qs;
1573
+ *x_dm = (half2 *) tile_x_d;
1574
+ }
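The shared tiles in allocate_tiles_q4_0 (and the other allocate_tiles_* functions) reserve GGML_CUDA_MMQ_Y extra elements because rows are later addressed with a stride of WARP_SIZE + 1, as in x_ql[i * (WARP_SIZE + 1) + k]; the +1 is the usual padding idiom that keeps the same column of consecutive rows in different shared-memory banks. A small index sketch under that reading:

#include <cstdio>

int main() {
    const int WARP_SIZE = 32, MMQ_Y = 64;
    const int elems = MMQ_Y * WARP_SIZE + MMQ_Y;       // matches the __shared__ array size above
    printf("tile ints: %d (row stride %d)\n", elems, WARP_SIZE + 1);
    for (int i = 0; i < 3; ++i) {
        const int idx = i * (WARP_SIZE + 1) + 0;       // column k = 0 of rows 0..2
        printf("row %d, k=0 -> index %d, bank %d\n", i, idx, idx % 32);
    }
    return 0;
}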
1575
+
1576
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
+
1580
+ __builtin_assume(i_offset >= 0);
1581
+ __builtin_assume(i_offset < 8);
1582
+ __builtin_assume(k >= 0);
1583
+ __builtin_assume(k < WARP_SIZE);
1584
+
1585
+ const int kbx = k / QI4_0;
1586
+ const int kqsx = k % QI4_0;
1587
+
1588
+ const block_q4_0 * bx0 = (block_q4_0 *) vx;
1589
+
1590
+ float * x_dmf = (float *) x_dm;
1591
+
1592
+ #pragma unroll
1593
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1594
+ int i = i0 + i_offset;
1497
1595
 
1498
- sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1499
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1596
+ if (need_check) {
1597
+ i = min(i, i_max);
1598
+ }
1599
+
1600
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
+
1602
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1500
1604
  }
1501
1605
 
1502
- return d*sumf_d - dmin*sumf_m;
1503
- #else
1504
- return 0.0f; // only to satisfy the compiler
1505
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1606
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
+ // const int kbxd = k % blocks_per_tile_x_row;
1608
+
1609
+ // #pragma unroll
1610
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
+ // FIXME out-of-bounds
1612
+ // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
+
1614
+ // if (i >= GGML_CUDA_MMQ_Y) {
1615
+ // return;
1616
+ // }
1617
+
1618
+ // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
+
1620
+ // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
+ // }
1506
1622
  }
1507
1623
 
1508
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1509
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1624
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1510
1627
 
1511
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1512
- const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1628
+ __builtin_assume(i >= 0);
1629
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
+ __builtin_assume(j >= 0);
1631
+ __builtin_assume(j < WARP_SIZE);
1632
+ __builtin_assume(k >= 0);
1633
+ __builtin_assume(k < WARP_SIZE);
1513
1634
 
1514
- const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1515
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1635
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
+ const float * x_dmf = (float *) x_dm;
1516
1637
 
1517
- float sumf = 0.0f;
1638
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
1518
1639
 
1519
- const float d = bq3_K->d;
1640
+ #pragma unroll
1641
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
1644
+ }
1520
1645
 
1521
- int vl;
1522
- memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1646
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1649
+ }
1523
1650
 
1524
- int vh;
1525
- memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1526
- vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1527
- vh >>= bq8_offset;
1651
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1652
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1528
1653
 
1529
- for (int i = 0; i < QR3_K; ++i) {
1530
- const int isc = scale_offset + 2*i;
1654
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1531
1655
 
1532
- const int isc_low = isc % (QK_K/32);
1533
- const int sc_shift_low = 4 * (isc / (QK_K/32));
1534
- const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1656
+ int v[VDR_Q4_1_Q8_1_MMVQ];
1657
+ int u[2*VDR_Q4_1_Q8_1_MMVQ];
1535
1658
 
1536
- const int isc_high = isc % (QK_K/64);
1537
- const int sc_shift_high = 2 * (isc / (QK_K/64));
1538
- const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1659
+ #pragma unroll
1660
+ for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
1661
+ v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
1662
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1663
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
1664
+ }
1539
1665
 
1540
- const int sc = (sc_low | sc_high) - 32;
1666
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
+ }
1541
1668
 
1542
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1543
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1544
- const float d8i = bq8i->d;
1669
+ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1545
1670
 
1546
- const int vil = (vl >> (2*i)) & 0x03030303;
1671
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
1672
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
1547
1673
 
1548
- const int vih = ((vh >> i) << 2) & 0x04040404;
1674
+ *x_ql = tile_x_qs;
1675
+ *x_dm = tile_x_dm;
1676
+ }
1549
1677
 
1550
- const int vi = __vsubss4(vil, vih);
1678
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
+
1682
+ __builtin_assume(i_offset >= 0);
1683
+ __builtin_assume(i_offset < 8);
1684
+ __builtin_assume(k >= 0);
1685
+ __builtin_assume(k < WARP_SIZE);
1686
+
1687
+ const int kbx = k / QI4_1;
1688
+ const int kqsx = k % QI4_1;
1689
+
1690
+ const block_q4_1 * bx0 = (block_q4_1 *) vx;
1551
1691
 
1552
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1692
+ #pragma unroll
1693
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1694
+ int i = i0 + i_offset;
1695
+
1696
+ if (need_check) {
1697
+ i = min(i, i_max);
1698
+ }
1699
+
1700
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
1701
+
1702
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
1553
1703
  }
1554
1704
 
1555
- return d*sumf;
1556
- #else
1557
- return 0.0f; // only to satisfy the compiler
1558
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1705
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
1706
+ const int kbxd = k % blocks_per_tile_x_row;
1707
+
1708
+ #pragma unroll
1709
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
1710
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
+
1712
+ if (need_check) {
1713
+ i = min(i, i_max);
1714
+ }
1715
+
1716
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1717
+
1718
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
1719
+ }
1559
1720
  }
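The need_check template flag in the load_tiles_* functions handles tiles that hang over the end of the tensor: rather than skipping out-of-range rows, the row index is clamped with i = min(i, i_max), which keeps every load in bounds at the cost of a few duplicated reads. A minimal illustration of the clamp with made-up sizes:

#include <algorithm>
#include <cstdio>

int main() {
    const int mmq_y = 64, i_max = 49;                  // last valid row of the tensor
    for (int i0 = 0; i0 < mmq_y; i0 += 8) {
        const int i_offset = 7;                        // worst-case offset within the 8-row step
        const int i = std::min(i0 + i_offset, i_max);  // same clamp as "i = min(i, i_max)"
        printf("i0 = %2d -> row %2d\n", i0, i);        // steps past row 49 all re-read row 49
    }
    return 0;
}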
1560
1721
 
1561
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1562
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1722
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1563
1725
 
1564
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1565
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1726
+ __builtin_assume(i >= 0);
1727
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
+ __builtin_assume(j >= 0);
1729
+ __builtin_assume(j < WARP_SIZE);
1730
+ __builtin_assume(k >= 0);
1731
+ __builtin_assume(k < WARP_SIZE);
1566
1732
 
1567
- float sumf_d = 0.0f;
1568
- float sumf_m = 0.0f;
1733
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1569
1734
 
1570
- #ifndef GGML_QKK_64
1735
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
1571
1736
 
1572
- // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573
- const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1737
+ #pragma unroll
1738
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
1741
+ }
1574
1742
 
1575
- const float d = bq4_K->d;
1576
- const float dmin = bq4_K->dmin;
1743
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1746
+ }
1577
1747
 
1578
- // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
1579
- // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
1580
- // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
1581
- // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
1748
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1749
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1582
1750
 
1583
- const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1584
- const int v1 = q4[0];
1585
- const int v2 = q4[4];
1751
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1586
1752
 
1587
- const uint16_t * scales = (const uint16_t *)bq4_K->scales;
1588
- uint16_t aux[2];
1589
- const int j = bq8_offset/2;
1590
- if (j < 2) {
1591
- aux[0] = scales[j+0] & 0x3f3f;
1592
- aux[1] = scales[j+2] & 0x3f3f;
1593
- } else {
1594
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1595
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1753
+ int vl[VDR_Q5_0_Q8_1_MMVQ];
1754
+ int vh[VDR_Q5_0_Q8_1_MMVQ];
1755
+ int u[2*VDR_Q5_0_Q8_1_MMVQ];
1756
+
1757
+ #pragma unroll
1758
+ for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
1759
+ vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
1760
+ vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
1761
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1762
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
1596
1763
  }
1597
- const uint8_t * sc = (const uint8_t *)aux;
1598
- const uint8_t * m = sc + 2;
1599
1764
 
1600
- for (int i = 0; i < QR4_K; ++i) {
1765
+ return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
+ }
1767
+
1768
+ static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
+
1770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
1772
+
1773
+ *x_ql = tile_x_ql;
1774
+ *x_dm = (half2 *) tile_x_d;
1775
+ }
1776
+
1777
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
+
1781
+ __builtin_assume(i_offset >= 0);
1782
+ __builtin_assume(i_offset < 8);
1783
+ __builtin_assume(k >= 0);
1784
+ __builtin_assume(k < WARP_SIZE);
1785
+
1786
+ const int kbx = k / QI5_0;
1787
+ const int kqsx = k % QI5_0;
1788
+
1789
+ const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
+
1791
+ #pragma unroll
1792
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1793
+ int i = i0 + i_offset;
1794
+
1795
+ if (need_check) {
1796
+ i = min(i, i_max);
1797
+ }
1798
+
1799
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
1800
+
1801
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
1802
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
1803
+
1804
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1805
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1806
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1807
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1808
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1809
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
1810
+
1811
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1812
+
1813
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1814
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1815
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1816
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1817
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1818
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
1819
+
1820
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1821
+ }
1822
+
1823
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
1824
+ const int kbxd = k % blocks_per_tile_x_row;
1825
+ float * x_dmf = (float *) x_dm;
1826
+
1827
+ #pragma unroll
1828
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
1829
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
+
1831
+ if (need_check) {
1832
+ i = min(i, i_max);
1833
+ }
1834
+
1835
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1836
+
1837
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
1838
+ }
1839
+ }
1840
+
1841
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
+
1845
+ __builtin_assume(i >= 0);
1846
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
+ __builtin_assume(j >= 0);
1848
+ __builtin_assume(j < WARP_SIZE);
1849
+ __builtin_assume(k >= 0);
1850
+ __builtin_assume(k < WARP_SIZE);
1851
+
1852
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
+ const float * x_dmf = (float *) x_dm;
1855
+
1856
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
+
1858
+ #pragma unroll
1859
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
1862
+ }
1863
+
1864
+ return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1866
+ }
1867
+
1868
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1869
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1870
+
1871
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1872
+
1873
+ int vl[VDR_Q5_1_Q8_1_MMVQ];
1874
+ int vh[VDR_Q5_1_Q8_1_MMVQ];
1875
+ int u[2*VDR_Q5_1_Q8_1_MMVQ];
1876
+
1877
+ #pragma unroll
1878
+ for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
1879
+ vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
1880
+ vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
1881
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1882
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
1883
+ }
1884
+
1885
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
+ }
1887
+
1888
+ static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
+
1890
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
1892
+
1893
+ *x_ql = tile_x_ql;
1894
+ *x_dm = tile_x_dm;
1895
+ }
1896
+
1897
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
+
1901
+ __builtin_assume(i_offset >= 0);
1902
+ __builtin_assume(i_offset < 8);
1903
+ __builtin_assume(k >= 0);
1904
+ __builtin_assume(k < WARP_SIZE);
1905
+
1906
+ const int kbx = k / QI5_1;
1907
+ const int kqsx = k % QI5_1;
1908
+
1909
+ const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
+
1911
+ #pragma unroll
1912
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1913
+ int i = i0 + i_offset;
1914
+
1915
+ if (need_check) {
1916
+ i = min(i, i_max);
1917
+ }
1918
+
1919
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
1920
+
1921
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
1922
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
1923
+
1924
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1925
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1926
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1927
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1928
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1929
+
1930
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1931
+
1932
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1933
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1934
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1935
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1936
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1937
+
1938
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1939
+ }
1940
+
1941
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
1942
+ const int kbxd = k % blocks_per_tile_x_row;
1943
+
1944
+ #pragma unroll
1945
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
1946
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
+
1948
+ if (need_check) {
1949
+ i = min(i, i_max);
1950
+ }
1951
+
1952
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1953
+
1954
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
1955
+ }
1956
+ }
1957
+
1958
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
+
1962
+ __builtin_assume(i >= 0);
1963
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
+ __builtin_assume(j >= 0);
1965
+ __builtin_assume(j < WARP_SIZE);
1966
+ __builtin_assume(k >= 0);
1967
+ __builtin_assume(k < WARP_SIZE);
1968
+
1969
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
+ const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
1971
+
1972
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
1973
+
1974
+ #pragma unroll
1975
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
1978
+ }
1979
+
1980
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1982
+ }
1983
+
1984
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1985
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1986
+
1987
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1988
+
1989
+ int v[VDR_Q8_0_Q8_1_MMVQ];
1990
+ int u[VDR_Q8_0_Q8_1_MMVQ];
1991
+
1992
+ for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
+ v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
+ u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
+ }
1996
+
1997
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
1998
+ }
1999
+
2000
+ static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
+
2002
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2004
+
2005
+ *x_ql = tile_x_qs;
2006
+ *x_dm = (half2 *) tile_x_d;
2007
+ }
2008
+
2009
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
+
2013
+ __builtin_assume(i_offset >= 0);
2014
+ __builtin_assume(i_offset < 8);
2015
+ __builtin_assume(k >= 0);
2016
+ __builtin_assume(k < WARP_SIZE);
2017
+
2018
+ const int kbx = k / QI8_0;
2019
+ const int kqsx = k % QI8_0;
2020
+ float * x_dmf = (float *) x_dm;
2021
+
2022
+ const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
+
2024
+ #pragma unroll
2025
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2026
+ int i = i0 + i_offset;
2027
+
2028
+ if (need_check) {
2029
+ i = min(i, i_max);
2030
+ }
2031
+
2032
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
+
2034
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
+ }
2037
+
2038
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
+ // const int kbxd = k % blocks_per_tile_x_row;
2040
+
2041
+ // #pragma unroll
2042
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
+ // FIXME out-of-bounds
2044
+ // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
+
2046
+ // #if GGML_CUDA_MMQ_Y < 64
2047
+ // if (i >= GGML_CUDA_MMQ_Y) {
2048
+ // return;
2049
+ // }
2050
+ // #endif // GGML_CUDA_MMQ_Y < 64
2051
+
2052
+ // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
+
2054
+ // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
+ // }
2056
+ }
2057
+
2058
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
+
2062
+ __builtin_assume(i >= 0);
2063
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
+ __builtin_assume(j >= 0);
2065
+ __builtin_assume(j < WARP_SIZE);
2066
+ __builtin_assume(k >= 0);
2067
+ __builtin_assume(k < WARP_SIZE);
2068
+
2069
+ const float * x_dmf = (float *) x_dm;
2070
+
2071
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
+ y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
+ }
2075
+
2076
+ #define VDR_q2_K_q8_1 1
2077
+
2078
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
+ const half2 & dm, const float * __restrict__ d8) {
2081
+
2082
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
+ float sumf_d = 0.0f;
2084
+ float sumf_m = 0.0f;
2085
+
2086
+ for (int i = 0; i < QR2_K; ++i) {
2087
+ const int sc = scales[2*i];
2088
+
2089
+ const int vi = (v >> (2*i)) & 0x03030303;
2090
+
2091
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
+
2093
+ int sc_high = sc >> 4;
2094
+ sc_high |= sc_high << 8;
2095
+ sc_high |= sc_high << 16;
2096
+ sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
+ }
2098
+
2099
+ const float2 dmf = __half22float2(dm);
2100
+
2101
+ return dmf.x*sumf_d - dmf.y*sumf_m;
2102
+ #else
2103
+ return 0.0f; // only to satisfy the compiler
2104
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2105
+ }
2106
+
2107
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+     const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+     const int bq8_offset = QR2_K * (iqs / QI8_1);
+     const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+     const uint8_t * scales = bq2_K->scales + scale_offset;
+
+     const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+     int u[QR2_K];
+     float d8[QR2_K];
+
+     for (int i = 0; i < QR2_K; ++ i) {
+         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+         d8[i] = bq8_1[bq8_offset + i].ds.x;
+     }
+
+     return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
+ }
+
+ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+     __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
+     __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
+     __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+
+     *x_ql = tile_x_ql;
+     *x_dm = tile_x_dm;
+     *x_sc = tile_x_sc;
+ }
+
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+
+     __builtin_assume(i_offset >= 0);
+     __builtin_assume(i_offset < 8);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI2_K;
+     const int kqsx = k % QI2_K;
+
+     const block_q2_K * bx0 = (block_q2_K *) vx;
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+         int i = i0 + i_offset;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+     }
+
+     const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+     const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
+         int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+         x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+     }
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+         x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+     }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
+     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+     __builtin_assume(i >= 0);
+     __builtin_assume(i < GGML_CUDA_MMQ_Y);
+     __builtin_assume(j >= 0);
+     __builtin_assume(j < WARP_SIZE);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI2_K;
+     const int kqsx = k % QI2_K;
+
+     const int bq8_offset = QR2_K * (kqsx / QI8_1);
+     const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
+
+     const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
+
+     int u[QR2_K];
+     float d8[QR2_K];
+
+     for (int l = 0; l < QR2_K; ++ l) {
+         const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
+         u[l]  = y_qs[y_qs_index];
+         d8[l] = y_ds[y_qs_index / QI8_1].x;
+     }
+
+     return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
+ }
+
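A note on the shared-memory layout used by the allocate_tiles_*/load_tiles_* pairs above: quant rows are stored with a stride of WARP_SIZE + 1 ints, and the dm/scale rows carry a similar i/QI2_K or i/4 offset. The extra element per row appears intended to stagger consecutive rows across shared-memory banks. A minimal sketch of the resulting indexing, for illustration only:

// Row i, column k of the quant tile; the "+ 1" pads each row by one int so that
// column k of successive rows does not land in the same shared-memory bank.
static __device__ __forceinline__ int tile_x_ql_index(const int i, const int k) {
    return i * (WARP_SIZE + 1) + k;
}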
+ #define VDR_q3_K_q8_1 1
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
+     const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+     const int & scale_offset, const float & d, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+     float sumf = 0.0f;
+
+     for (int i = 0; i < QR3_K; ++i) {
+         const int isc = scale_offset + 2*i;
+
+         const int isc_low = isc % (QK_K/32);
+         const int sc_shift_low = 4 * (isc / (QK_K/32));
+         const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+         const int isc_high = isc % (QK_K/64);
+         const int sc_shift_high = 2 * (isc / (QK_K/64));
+         const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+         const int sc = (sc_low | sc_high) - 32;
+
+         const int vil = (vl >> (2*i)) & 0x03030303;
+
+         const int vih = ((vh >> i) << 2) & 0x04040404;
+
+         const int vi = __vsubss4(vil, vih);
+
+         sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+     }
+
+     return d*sumf;
+ #else
+     return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+     const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+     const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+     const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+     const float d = bq3_K->d;
+
+     const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+     // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+     const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+     int u[QR3_K];
+     float d8[QR3_K];
+
+     for (int i = 0; i < QR3_K; ++i) {
+         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+         d8[i] = bq8_1[bq8_offset + i].ds.x;
+     }
+
+     return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+ }
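The inverted hmask above can be read as follows: in q3_K the high bit of each quant acts as a "+4", so after inverting the mask a cleared bit contributes a subtraction of 4 and a set bit contributes nothing. A scalar sketch of the per-value decode (editor's illustration, not part of the diff):

// One q3_K value from its low 2 bits and its high-mask bit, matching
// vi = __vsubss4(vil, vih) above with the mask already inverted.
static __device__ __forceinline__ int8_t decode_q3_value(const uint8_t low2, const int high_bit_set) {
    return (int8_t)(low2 & 3) - (high_bit_set ? 0 : 4);
}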
+
+ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+     __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
+     __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
+     __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
+     __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+
+     *x_ql = tile_x_ql;
+     *x_dm = tile_x_dm;
+     *x_qh = tile_x_qh;
+     *x_sc = tile_x_sc;
+ }
+
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+
+     __builtin_assume(i_offset >= 0);
+     __builtin_assume(i_offset < 8);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI3_K;
+     const int kqsx = k % QI3_K;
+
+     const block_q3_K * bx0 = (block_q3_K *) vx;
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+         int i = i0 + i_offset;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+     }
+
+     const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+     const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
+         int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+         x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
+     }
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
+         int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+         x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+     }
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+         x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
+     }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
+     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+     __builtin_assume(i >= 0);
+     __builtin_assume(i < GGML_CUDA_MMQ_Y);
+     __builtin_assume(j >= 0);
+     __builtin_assume(j < WARP_SIZE);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI3_K;
+     const int kqsx = k % QI3_K;
+
+     const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
+     const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
+
+     const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
+
+     // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+     const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
+
+     int u[QR3_K];
+     float d8[QR3_K];
+
+     for (int l = 0; l < QR3_K; ++ l) {
+         const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
+         u[l]  = y_qs[y_qs_index];
+         d8[l] = y_ds[y_qs_index / QI8_1].x;
+     }
+
+     return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
+                                   x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
+ }
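The need_check template flag threaded through the load_tiles_* functions above handles the last, possibly partial tile of rows: instead of branching per element, out-of-range rows are clamped to i_max and simply re-load the final row, and the mul_mat_q kernel further down returns early for row_dst >= nrows_dst so those duplicated results are never written. A hedged sketch of the pattern, with an illustrative name:

// Clamp a tile row index when the tile may extend past the last row of x.
template <bool need_check>
static __device__ __forceinline__ int clamp_tile_row(const int i, const int i_max) {
    return need_check ? min(i, i_max) : i;
}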
+
+ #define VDR_q4_K_q8_1 2
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
+     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+     const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+     float sumf_d = 0.0f;
+     float sumf_m = 0.0f;
+
+     for (int i = 0; i < QR4_K; ++i) {
+         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+         const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+         const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+         sumf_d += d8[i] * (dot1 * sc[i]);
+         sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+     }
+
+     return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
+
+ #else
+     return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+ #ifndef GGML_QKK_64
+     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+     int   v[2];
+     int   u[2*QR4_K];
+     float d8[QR4_K];
+
+     // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+     const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+     // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+     // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+     // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+     // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+     const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+     v[0] = q4[0];
+     v[1] = q4[4];
+
+     const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+     uint16_t aux[2];
+     const int j = bq8_offset/2;
+     if (j < 2) {
+         aux[0] = scales[j+0] & 0x3f3f;
+         aux[1] = scales[j+2] & 0x3f3f;
+     } else {
+         aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+         aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+     }
+     const uint8_t * sc = (const uint8_t *)aux;
+     const uint8_t * m  = sc + 2;
+
+     for (int i = 0; i < QR4_K; ++i) {
+         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+         d8[i] = bq8i->ds.x;
+
+         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+         u[2*i+0] = q8[0];
+         u[2*i+1] = q8[4];
+     }
+
+     return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
+
+ #else
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+     float sumf_d = 0.0f;
+     float sumf_m = 0.0f;
+
+     uint16_t aux16[2];
+     const uint8_t * s = (const uint8_t *)aux16;
+
+     const uint16_t * a = (const uint16_t *)bq4_K->scales;
+     aux16[0] = a[0] & 0x0f0f;
+     aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+     const float dall = bq4_K->d[0];
+     const float dmin = bq4_K->d[1];
+
+     const float d8_1 = bq8_1[0].ds.x;
+     const float d8_2 = bq8_1[1].ds.x;
+
+     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+     const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+     const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+     const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+     const int v1 = q4[0];
+     const int v2 = q4[4];
+
+     const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+     const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+     const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+     const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+     sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+     sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+     return dall * sumf_d - dmin * sumf_m;
+
+ #else
+     return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+ #endif
+ }
+
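The aux[]/sc/m dance above unpacks q4_K's packed 6-bit scales and mins: sub-blocks 0-3 keep them in the low 6 bits of the first eight scale bytes, while sub-blocks 4-7 split them into a low nibble plus the two spare high bits of the earlier bytes. A scalar sketch of the same unpacking for a single sub-block (editor's illustration, not part of the diff; only meaningful for QK_K == 256):

// Scale and min for sub-block is (0..7) of a q4_K super-block; the 16-bit aux[]
// trick above reads two neighbouring sub-blocks at once with the same layout.
static __device__ __forceinline__ void get_q4_K_scale_min_ref(const uint8_t * scales, const int is,
                                                              uint8_t * sc, uint8_t * m) {
    if (is < 4) {
        *sc = scales[is]     & 63;
        *m  = scales[is + 4] & 63;
    } else {
        *sc = (scales[is + 4] & 0xF) | ((scales[is - 4] >> 6) << 4);
        *m  = (scales[is + 4] >>  4) | ((scales[is    ] >> 6) << 4);
    }
}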
+
+ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+     __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
+     __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
+     __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+
+     *x_ql = tile_x_ql;
+     *x_dm = tile_x_dm;
+     *x_sc = tile_x_sc;
+ }
+
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+
+     __builtin_assume(i_offset >= 0);
+     __builtin_assume(i_offset < 8);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI4_K; // == 0 if QK_K == 256
+     const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+     const block_q4_K * bx0 = (block_q4_K *) vx;
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+         int i = i0 + i_offset;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+     }
+
+     const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+     const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
+         int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+     }
+
+ #pragma unroll
+     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
+         int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+         if (need_check) {
+             i = min(i, i_max);
+         }
+
+         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+         x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
+     }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
+     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+     __builtin_assume(i >= 0);
+     __builtin_assume(i < GGML_CUDA_MMQ_Y);
+     __builtin_assume(j >= 0);
+     __builtin_assume(j < WARP_SIZE);
+     __builtin_assume(k >= 0);
+     __builtin_assume(k < WARP_SIZE);
+
+     const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+     const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+     int   v[2];
+     int   u[2*QR4_K];
+     float d8[QR4_K];
+
+     // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
+     const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
+
+     v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
+     v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
+
+     const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
+     uint16_t aux[2];
+     const int l = bq8_offset/2;
+     if (l < 2) {
+         aux[0] = scales[l+0] & 0x3f3f;
+         aux[1] = scales[l+2] & 0x3f3f;
+     } else {
+         aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
+         aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
+     }
+     const uint8_t * sc = (const uint8_t *)aux;
+     const uint8_t * m  = sc + 2;
+
+     for (int l = 0; l < QR4_K; ++l) {
+         const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
+         u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
+         u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
+         d8[l] = y_ds[kqsy / QI8_1].x;
+     }
+
+     return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
+ }
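For orientation, this is roughly how the q4_K tile functions above plug into the mul_mat_q template defined further down in this diff. The helper below is an editor's sketch modelled on the ggml_mul_mat_q4_0_q8_1_cuda wrapper near the end of this file; the actual q4_K wrapper is not part of this hunk, so treat the function name and the choice of VDR_q4_K_q8_1 as the vdr argument as assumptions.

static void example_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

    // one CUDA block per tile of GGML_CUDA_MMQ_Y output rows x WARP_SIZE output columns
    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
    const dim3 block_nums(block_num_x, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);

    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        // last row tile is partial: clamp row loads via the need_check variant
        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}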
+
2645
+ #define VDR_q5_K_q8_1 2
2646
+
2647
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2650
+
2651
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
+ float sumf_d = 0.0f;
2653
+ float sumf_m = 0.0f;
2654
+
2655
+ for (int i = 0; i < QR5_K; ++i) {
2656
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2658
+
2659
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2661
+
2662
+ const int v0i = vl0i | vh0i;
2663
+ const int v1i = vl1i | vh1i;
2664
+
2665
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2667
+
2668
+ sumf_d += d8[i] * (dot1 * sc[i]);
2669
+ sumf_m += d8[i] * (dot2 * m[i]);
2670
+
2671
+ }
2672
+
2673
+ return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2674
+
2675
+ #else
2676
+ return 0.0f; // only to satisfy the compiler
2677
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2678
+ }
2679
+
2680
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2681
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2682
+
2683
+ #ifndef GGML_QKK_64
2684
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2685
+
2686
+ int vl[2];
2687
+ int vh[2];
2688
+ int u[2*QR5_K];
2689
+ float d8[QR5_K];
2690
+
2691
+ const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
2692
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2693
+ const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
2694
+
2695
+ vl[0] = ql[0];
2696
+ vl[1] = ql[4];
2697
+
2698
+ vh[0] = qh[0] >> bq8_offset;
2699
+ vh[1] = qh[4] >> bq8_offset;
2700
+
2701
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
2702
+ uint16_t aux[2];
2703
+ const int j = bq8_offset/2;
2704
+ if (j < 2) {
2705
+ aux[0] = scales[j+0] & 0x3f3f;
2706
+ aux[1] = scales[j+2] & 0x3f3f;
2707
+ } else {
2708
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2709
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2710
+ }
2711
+ const uint8_t * sc = (const uint8_t *)aux;
2712
+ const uint8_t * m = sc + 2;
2713
+
2714
+ for (int i = 0; i < QR5_K; ++i) {
2715
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
+ d8[i] = bq8i->ds.x;
2717
+
2718
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2719
+ u[2*i+0] = q8[0];
2720
+ u[2*i+1] = q8[4];
2721
+ }
2722
+
2723
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2724
+
2725
+ #else
2726
+
2727
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2728
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2729
+
2730
+ const int8_t * s = bq5_K->scales;
2731
+
2732
+ const float d = bq5_K->d;
2733
+
2734
+ const float d8_1 = bq8_1[0].ds.x;
2735
+ const float d8_2 = bq8_1[1].ds.x;
2736
+
2737
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2738
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2739
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2740
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2741
+
2742
+ const int * ql = (const int *)bq5_K->qs + (iqs/2);
2743
+ const int vl1 = ql[0];
2744
+ const int vl2 = ql[4];
2745
+
2746
+ const int step = 4 * (iqs/2); // 0, 4, 8, 12
2747
+ const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
2748
+ const int in = step%8; // 0, 4, 0, 4
2749
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
2750
+
2751
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
2752
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
2753
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
2754
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
2755
+
2756
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
2757
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
2758
+
2759
+ return d * sumf_d;
2760
+
2761
+ #else
2762
+ return 0.0f; // only to satisfy the compiler
2763
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2764
+
2765
+ #endif
2766
+ }
2767
+
2768
+ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
+
2770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2774
+
2775
+ *x_ql = tile_x_ql;
2776
+ *x_dm = tile_x_dm;
2777
+ *x_qh = tile_x_qh;
2778
+ *x_sc = tile_x_sc;
2779
+ }
2780
+
2781
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
+
2785
+ __builtin_assume(i_offset >= 0);
2786
+ __builtin_assume(i_offset < 8);
2787
+ __builtin_assume(k >= 0);
2788
+ __builtin_assume(k < WARP_SIZE);
2789
+
2790
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
2791
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
2792
+
2793
+ const block_q5_K * bx0 = (block_q5_K *) vx;
2794
+
2795
+ #pragma unroll
2796
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2797
+ int i = i0 + i_offset;
2798
+
2799
+ if (need_check) {
2800
+ i = min(i, i_max);
2801
+ }
2802
+
2803
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2804
+
2805
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2806
+ }
2807
+
2808
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
+
2811
+ #pragma unroll
2812
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2814
+
2815
+ if (need_check) {
2816
+ i = min(i, i_max);
2817
+ }
2818
+
2819
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
2820
+
2821
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
2822
+ }
2823
+
2824
+ #pragma unroll
2825
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2827
+
2828
+ if (need_check) {
2829
+ i = min(i, i_max);
2830
+ }
2831
+
2832
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
+
2834
+ x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
+ }
2836
+
2837
+ #pragma unroll
2838
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2840
+
2841
+ if (need_check) {
2842
+ i = min(i, i_max);
2843
+ }
2844
+
2845
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2846
+
2847
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2848
+ }
2849
+ }
2850
+
2851
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
+
2855
+ __builtin_assume(i >= 0);
2856
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
+ __builtin_assume(j >= 0);
2858
+ __builtin_assume(j < WARP_SIZE);
2859
+ __builtin_assume(k >= 0);
2860
+ __builtin_assume(k < WARP_SIZE);
2861
+
2862
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
+
2865
+ int vl[2];
2866
+ int vh[2];
2867
+ int u[2*QR4_K];
2868
+ float d8[QR4_K];
2869
+
2870
+ const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
+
2872
+ vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
+ vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
+
2875
+ vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
+ vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
+
2878
+ const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
+ uint16_t aux[2];
2880
+ const int l = bq8_offset/2;
2881
+ if (l < 2) {
2882
+ aux[0] = scales[l+0] & 0x3f3f;
2883
+ aux[1] = scales[l+2] & 0x3f3f;
2884
+ } else {
2885
+ aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
+ aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
+ }
2888
+ const uint8_t * sc = (const uint8_t *)aux;
2889
+ const uint8_t * m = sc + 2;
2890
+
2891
+ for (int l = 0; l < QR5_K; ++l) {
2892
+ const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
+ u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
+ u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
+ d8[l] = y_ds[kqsy / QI8_1].x;
2896
+ }
2897
+
2898
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
+ }
2900
+
2901
+ #define VDR_q6_K_q8_1 1
2902
+
2903
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
+ const float & d, const float * __restrict__ d8) {
2906
+
2907
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
+ float sumf = 0.0f;
2909
+
2910
+ for (int i = 0; i < QR6_K; ++i) {
2911
+ const int sc = scales[4*i];
2912
+
2913
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
2914
+
2915
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
2916
+
2917
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
2918
+
2919
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2920
+ }
2921
+
2922
+ return d*sumf;
2923
+ #else
2924
+ return 0.0f; // only to satisfy the compiler
2925
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2926
+ }
2927
+
2928
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2929
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2930
+
2931
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
2932
+
2933
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
2934
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
2935
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1601
2936
 
1602
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1603
- const float d8i = bq8i->d;
1604
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1605
- const int ui1 = q8[0];
1606
- const int ui2 = q8[4];
2937
+ const int vl = get_int_from_uint8(bq6_K->ql, iqs);
2938
+ const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
1607
2939
 
1608
- const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
1609
- const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
2940
+ const int8_t * scales = bq6_K->scales + scale_offset;
1610
2941
 
1611
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1612
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2942
+ int u[QR6_K];
2943
+ float d8[QR6_K];
1613
2944
 
1614
- sumf_d += d8i * (dot1 * sc[i]);
1615
- sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2945
+ for (int i = 0; i < QR6_K; ++i) {
2946
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
+ d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
1616
2948
  }
1617
2949
 
1618
- return d*sumf_d - dmin*sumf_m;
2950
+ return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
2951
+ }
1619
2952
 
1620
- #else
2953
+ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1621
2954
 
1622
- uint16_t aux16[2];
1623
- const uint8_t * s = (const uint8_t *)aux16;
2955
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
1624
2959
 
1625
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626
- aux16[0] = a[0] & 0x0f0f;
1627
- aux16[1] = (a[0] >> 4) & 0x0f0f;
2960
+ *x_ql = tile_x_ql;
2961
+ *x_dm = tile_x_dm;
2962
+ *x_qh = tile_x_qh;
2963
+ *x_sc = tile_x_sc;
2964
+ }
1628
2965
 
1629
- const float dall = bq4_K->d[0];
1630
- const float dmin = bq4_K->d[1];
2966
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1631
2969
 
1632
- const float d8_1 = bq8_1[0].d;
1633
- const float d8_2 = bq8_1[1].d;
2970
+ __builtin_assume(i_offset >= 0);
2971
+ __builtin_assume(i_offset < 8);
2972
+ __builtin_assume(k >= 0);
2973
+ __builtin_assume(k < WARP_SIZE);
1634
2974
 
1635
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
2975
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2976
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1639
2977
 
1640
- const int * q4 = (const int *)bq4_K->qs + iqs;
1641
- const int v1 = q4[0];
1642
- const int v2 = q4[4];
2978
+ const block_q6_K * bx0 = (block_q6_K *) vx;
1643
2979
 
1644
- const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645
- const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646
- const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647
- const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2980
+ #pragma unroll
2981
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2982
+ int i = i0 + i_offset;
1648
2983
 
1649
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2984
+ if (need_check) {
2985
+ i = min(i, i_max);
2986
+ }
1651
2987
 
1652
- return dall * sumf_d - dmin * sumf_m;
2988
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1653
2989
 
1654
- #endif
2990
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
+ }
1655
2992
 
1656
- #else
1657
- return 0.0f; // only to satisfy the compiler
1658
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1659
- }
2993
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1660
2995
 
1661
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1662
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
2996
+ #pragma unroll
2997
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
1663
2999
 
1664
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1665
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
3000
+ if (need_check) {
3001
+ i = min(i, i_max);
3002
+ }
1666
3003
 
1667
- #ifndef GGML_QKK_64
3004
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1668
3005
 
1669
- const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
1670
- const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1671
- const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
3006
+ x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3007
+ }
1672
3008
 
1673
- float sumf_d = 0.0f;
1674
- float sumf_m = 0.0f;
3009
+ #pragma unroll
3010
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
1675
3012
 
1676
- const float d = bq5_K->d;
1677
- const float dmin = bq5_K->dmin;
3013
+ if (need_check) {
3014
+ i = min(i, i_max);
3015
+ }
1678
3016
 
1679
- const int vl1 = ql[0];
1680
- const int vl2 = ql[4];
3017
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
1681
3018
 
1682
- const int vh1 = qh[0] >> bq8_offset;
1683
- const int vh2 = qh[4] >> bq8_offset;
3019
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3020
+ }
1684
3021
 
1685
- const uint16_t * scales = (const uint16_t *)bq5_K->scales;
1686
- uint16_t aux[2];
1687
- const int j = bq8_offset/2;
1688
- if (j < 2) {
1689
- aux[0] = scales[j+0] & 0x3f3f;
1690
- aux[1] = scales[j+2] & 0x3f3f;
1691
- } else {
1692
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1693
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
3022
+ #pragma unroll
3023
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3025
+
3026
+ if (need_check) {
3027
+ i = min(i, i_max);
3028
+ }
3029
+
3030
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
3031
+
3032
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
1694
3033
  }
1695
- const uint8_t * sc = (const uint8_t *)aux;
1696
- const uint8_t * m = sc + 2;
3034
+ }
1697
3035
 
1698
- for (int i = 0; i < QR5_K; ++i) {
3036
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1699
3039
 
1700
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1701
- const float d8i = bq8i->d;
1702
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1703
- const int ui1 = q8[0];
1704
- const int ui2 = q8[4];
3040
+ __builtin_assume(i >= 0);
3041
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
+ __builtin_assume(j >= 0);
3043
+ __builtin_assume(j < WARP_SIZE);
3044
+ __builtin_assume(k >= 0);
3045
+ __builtin_assume(k < WARP_SIZE);
1705
3046
 
1706
- const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
1707
- const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
3047
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1708
3049
 
1709
- const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
1710
- const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
3050
+ const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
+ const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
+ const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
1711
3053
 
1712
- const int vi1 = vil1 | vih1;
1713
- const int vi2 = vil2 | vih2;
3054
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
1714
3055
 
1715
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1716
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
3056
+ const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
+ const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
1717
3058
 
1718
- sumf_d += d8i * (dot1 * sc[i]);
1719
- sumf_m += d8i * (dot2 * m[i]);
3059
+ int u[QR6_K];
3060
+ float d8[QR6_K];
1720
3061
 
3062
+ for (int l = 0; l < QR6_K; ++l) {
3063
+ const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
+ u[l] = y_qs[kqsy];
3065
+ d8[l] = y_ds[kqsy / QI8_1].x;
1721
3066
  }
1722
3067
 
1723
- return d*sumf_d - dmin*sumf_m;
1724
-
1725
- #else
3068
+ return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
+ x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3070
+ }
1726
3071
 
1727
- const int8_t * s = bq5_K->scales;
3072
+ template <int qk, int qr, int qi, typename block_q_t,
3073
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
+ static __global__ void mul_mat_q(
3075
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3076
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1728
3077
 
1729
- const float d = bq5_K->d;
3078
+ const block_q_t * x = (const block_q_t *) vx;
3079
+ const block_q8_1 * y = (const block_q8_1 *) vy;
1730
3080
 
1731
- const float d8_1 = bq8_1[0].d;
1732
- const float d8_2 = bq8_1[1].d;
3081
+ const int blocks_per_row_x = ncols_x / qk;
3082
+ const int blocks_per_col_y = nrows_y / QK8_1;
3083
+ const int blocks_per_warp = WARP_SIZE / qi;
1733
3084
 
1734
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
3085
+ const int & ncols_dst = ncols_y;
1738
3086
 
1739
- const int * ql = (const int *)bq5_K->qs + iqs;
1740
- const int vl1 = ql[0];
1741
- const int vl2 = ql[4];
3087
+ const int tid_x = threadIdx.x;
3088
+ const int tid_y = threadIdx.y;
1742
3089
 
1743
- const int step = 4 * iqs; // 0, 4, 8, 12
1744
- const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745
- const int in = step%8; // 0, 4, 0, 4
1746
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
3090
+ const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3091
+ const int & row_x_0 = row_dst_0;
3092
+ const int row_dst = row_dst_0 + tid_x;
1747
3093
 
1748
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
3094
+ const int col_dst_0 = blockIdx.y*WARP_SIZE;
3095
+ const int & col_y_0 = col_dst_0;
1752
3096
 
1753
- const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754
- + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
3097
+ int * tile_x_ql = nullptr;
3098
+ half2 * tile_x_dm = nullptr;
3099
+ int * tile_x_qh = nullptr;
3100
+ int * tile_x_sc = nullptr;
1755
3101
 
1756
- return d * sumf_d;
3102
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
1757
3103
 
1758
- #endif
3104
+ const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
1759
3105
 
1760
- #else
1761
- return 0.0f; // only to satisfy the compiler
1762
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1763
- }
3106
+ __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
+ __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
1764
3108
 
1765
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1766
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
3109
+ float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
1767
3110
 
1768
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1769
- const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3111
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1770
3112
 
1771
- const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1772
- const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1773
- const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3113
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
+ tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
1774
3115
 
1775
- float sumf = 0.0f;
3116
+ for (int ir = 0; ir < qr; ++ir) {
3117
+ const int kqs = ir*WARP_SIZE + tid_x;
3118
+ const int kbxd = kqs / QI8_1;
1776
3119
 
1777
- const float d = bq6_K->d;
3120
+ for (int i = 0; i < WARP_SIZE; i += 8) {
3121
+ const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1778
3122
 
1779
- int vl;
1780
- memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
3123
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1781
3124
 
1782
- int vh;
1783
- memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
3125
+ tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3126
+ }
3127
+ }
1784
3128
 
1785
- for (int i = 0; i < QR6_K; ++i) {
1786
- const int sc = bq6_K->scales[scale_offset + 4*i];
3129
+ for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
+ const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
+ const int kby = tid_x % blocks_per_tile_y_col;
3132
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
+ tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
+ }
1787
3135
 
1788
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1789
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1790
- const float d8i = bq8i->d;
3136
+ __syncthreads();
1791
3137
 
1792
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3138
+ #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
+ #pragma unroll
3140
+ #endif // __CUDA_ARCH__ >= 700
3141
+ for (int k = 0; k < WARP_SIZE; k += vdr) {
3142
+ #pragma unroll
3143
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3144
+ #pragma unroll
3145
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
+ sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
+ tid_x + i, tid_y + j, k);
3148
+ }
3149
+ }
3150
+ }
1793
3151
 
1794
- const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
3152
+ __syncthreads();
3153
+ }
1795
3154
 
1796
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1797
3155
 
1798
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3156
+ if (row_dst >= nrows_dst) {
3157
+ return;
1799
3158
  }
1800
3159
 
1801
- return d*sumf;
1802
- #else
1803
- return 0.0f; // only to satisfy the compiler
1804
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3160
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3161
+ const int col_dst = col_dst_0 + j + tid_y;
3162
+
3163
+ if (col_dst >= ncols_dst) {
3164
+ return;
3165
+ }
3166
+
3167
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3169
+ }
3170
+ }
1805
3171
  }
1806
3172
 
1807
- template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
3173
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
1808
3174
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1809
3175
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1810
3176
 
@@ -1813,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1813
3179
  }
1814
3180
 
1815
3181
  const int blocks_per_row = ncols / qk;
1816
- const int blocks_per_warp = WARP_SIZE / qi;
3182
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
1817
3183
 
1818
3184
  // partial sum for each thread
1819
3185
  float tmp = 0.0f;
@@ -1822,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1822
3188
  const block_q8_1 * y = (const block_q8_1 *) vy;
1823
3189
 
1824
3190
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1825
- const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
3191
+ const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
1826
3192
 
1827
- const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
3193
+ const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
1828
3194
 
1829
- const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
3195
+ const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
1830
3196
 
1831
3197
  tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1832
3198
  }
@@ -1859,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1859
3225
  const int y_offset = qr == 1 ? 1 : qk/2;
1860
3226
 
1861
3227
  // partial sum for each thread
1862
- #ifdef GGML_CUDA_DMMV_F16
3228
+ #ifdef GGML_CUDA_F16
1863
3229
  half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
1864
3230
  #else
1865
3231
  float tmp = 0.0f;
1866
- #endif // GGML_CUDA_DMMV_F16
3232
+ #endif // GGML_CUDA_F16
1867
3233
 
1868
3234
  for (int i = 0; i < ncols; i += iter_stride) {
1869
3235
  const int col = i + vals_per_iter*tid;
@@ -1883,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1883
3249
 
1884
3250
  // matrix multiplication
1885
3251
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
1886
- #ifdef GGML_CUDA_DMMV_F16
3252
+ #ifdef GGML_CUDA_F16
1887
3253
  tmp += __hmul2(v, {
1888
3254
  y[iybs + iqs + j/qr + 0],
1889
3255
  y[iybs + iqs + j/qr + y_offset]
@@ -1891,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1891
3257
  #else
1892
3258
  tmp += v.x * y[iybs + iqs + j/qr + 0];
1893
3259
  tmp += v.y * y[iybs + iqs + j/qr + y_offset];
1894
- #endif // GGML_CUDA_DMMV_F16
3260
+ #endif // GGML_CUDA_F16
1895
3261
  }
1896
3262
  }
1897
3263
 
@@ -1902,11 +3268,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1902
3268
  }
1903
3269
 
1904
3270
  if (tid == 0) {
1905
- #ifdef GGML_CUDA_DMMV_F16
3271
+ #ifdef GGML_CUDA_F16
1906
3272
  dst[row] = tmp.x + tmp.y;
1907
3273
  #else
1908
3274
  dst[row] = tmp;
1909
- #endif // GGML_CUDA_DMMV_F16
3275
+ #endif // GGML_CUDA_F16
1910
3276
  }
1911
3277
  }
1912
3278
 
@@ -2046,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
2046
3412
  }
2047
3413
 
2048
3414
  // rope == RoPE == rotary positional embedding
2049
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
3415
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3416
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
2050
3417
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
2051
3418
 
2052
3419
  if (col >= ncols) {
@@ -2056,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
2056
3423
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
2057
3424
  const int i = row*ncols + col;
2058
3425
 
2059
- const float theta = p*powf(theta_scale, col/2);
3426
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
2060
3427
  const float sin_theta = sinf(theta);
2061
3428
  const float cos_theta = cosf(theta);
2062
3429
 
@@ -2203,9 +3570,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
2203
3570
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
2204
3571
  }
2205
3572
 
2206
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
2207
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
2208
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
3573
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
3574
+ const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
3575
+ const dim3 num_blocks(block_num_x, ky, 1);
3576
+ const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
3577
+ quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
2209
3578
  }
2210
3579
 
2211
3580
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2366,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
2366
3735
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2367
3736
  const dim3 block_nums(1, block_num_y, 1);
2368
3737
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2369
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
3738
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
2370
3739
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2371
3740
  }
2372
3741
 
@@ -2375,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
2375
3744
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2376
3745
  const dim3 block_nums(1, block_num_y, 1);
2377
3746
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2378
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
3747
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
2379
3748
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2380
3749
  }
2381
3750
 
@@ -2384,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
2384
3753
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2385
3754
  const dim3 block_nums(1, block_num_y, 1);
2386
3755
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2387
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
3756
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
2388
3757
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2389
3758
  }
2390
3759
 
@@ -2393,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
2393
3762
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2394
3763
  const dim3 block_nums(1, block_num_y, 1);
2395
3764
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2396
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
3765
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
2397
3766
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2398
3767
  }
2399
3768
 
@@ -2402,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
2402
3771
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2403
3772
  const dim3 block_nums(1, block_num_y, 1);
2404
3773
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2405
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
3774
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
2406
3775
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2407
3776
  }
2408
3777
 
@@ -2411,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
2411
3780
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2412
3781
  const dim3 block_nums(1, block_num_y, 1);
2413
3782
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2414
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
3783
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
2415
3784
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2416
3785
  }
2417
3786
 
@@ -2420,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
2420
3789
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2421
3790
  const dim3 block_nums(1, block_num_y, 1);
2422
3791
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2423
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
3792
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
2424
3793
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2425
3794
  }
2426
3795
 
@@ -2429,10 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2429
3798
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2430
3799
  const dim3 block_nums(1, block_num_y, 1);
2431
3800
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2432
- // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
2433
- // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
2434
- // is better amortized.
2435
- mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
3801
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
2436
3802
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2437
3803
  }
2438
3804
 
@@ -2441,10 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
2441
3807
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2442
3808
  const dim3 block_nums(1, block_num_y, 1);
2443
3809
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2444
- // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
2445
- // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
2446
- // is better amortized.
2447
- mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
3810
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
2448
3811
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2449
3812
  }
2450
3813
 
@@ -2453,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
2453
3816
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2454
3817
  const dim3 block_nums(1, block_num_y, 1);
2455
3818
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2456
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
3819
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
2457
3820
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2458
3821
  }
2459
3822
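The hunks above replace the hand-tuned QI4_K/2 and QI5_K/2 arguments (and their comments about amortizing the k-quant scale computation) with an explicit VDR_* template parameter on mul_mat_vec_q. A minimal arithmetic sketch of why the two parameterizations can coincide, assuming the kernel advances vdr*WARP_SIZE/qi quant blocks per warp iteration; the formula and constant values below are assumptions for illustration, not code from ggml-cuda.cu:

    // Standalone sketch, not the kernel: compares how many quant blocks a warp
    // covers per iteration under the old "qi/2" trick vs. an explicit vdr.
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int QI4_K     = 32;                        // assumed: QK_K / (4*QR4_K) = 256/8

        const int blocks_old = WARP_SIZE / (QI4_K / 2);  // old: qi halved -> 2 blocks per iteration
        const int vdr        = 2;                        // hypothetical value of VDR_q4_K_q8_1
        const int blocks_new = vdr * WARP_SIZE / QI4_K;  // new: explicit vdr -> same 2 blocks per iteration

        std::printf("old: %d blocks/warp iteration, new: %d\n", blocks_old, blocks_new);
        return 0;
    }

Under that assumption, passing a VDR of 2 has the same effect as halving qi, but keeps the true block constants in the template arguments.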
 
@@ -2500,6 +3863,186 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2500
3863
  }
2501
3864
  }
2502
3865
 
3866
+ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
+
3870
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3872
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3873
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3874
+
3875
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3878
+ } else {
3879
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3881
+ }
3882
+ }
3883
+
3884
+ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
+
3888
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3890
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3891
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3892
+
3893
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3896
+ } else {
3897
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3899
+ }
3900
+ }
3901
+
3902
+ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
+
3906
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3908
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3909
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3910
+
3911
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3914
+ } else {
3915
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3917
+ }
3918
+ }
3919
+
3920
+ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
+
3924
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3926
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3927
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3928
+
3929
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3932
+ } else {
3933
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3935
+ }
3936
+ }
3937
+
3938
+ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
+
3942
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3944
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3945
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3946
+
3947
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3950
+ } else {
3951
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3953
+ }
3954
+ }
3955
+
3956
+ static void ggml_mul_mat_q2_K_q8_1_cuda(
3957
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3958
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3959
+
3960
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3961
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3962
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3963
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3964
+
3965
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3966
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3967
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
+ } else {
3969
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3970
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3971
+ }
3972
+ }
3973
+
3974
+ static void ggml_mul_mat_q3_K_q8_1_cuda(
3975
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3976
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3977
+
3978
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3979
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3980
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3981
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3982
+
3983
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3984
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ } else {
3987
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3988
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3989
+ }
3990
+ }
3991
+
3992
+ static void ggml_mul_mat_q4_K_q8_1_cuda(
3993
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3994
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3995
+
3996
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3997
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3998
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3999
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4000
+
4001
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4002
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4003
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4004
+ } else {
4005
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4006
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4007
+ }
4008
+ }
4009
+
4010
+ static void ggml_mul_mat_q5_K_q8_1_cuda(
4011
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4012
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4013
+
4014
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4015
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4016
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4017
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4018
+
4019
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4020
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4021
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4022
+ } else {
4023
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4024
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4025
+ }
4026
+ }
4027
+
4028
+ static void ggml_mul_mat_q6_K_q8_1_cuda(
4029
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4030
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4031
+
4032
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4033
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4034
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4035
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4036
+
4037
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4038
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4039
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4040
+ } else {
4041
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4042
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4043
+ }
4044
+ }
4045
+
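Each of the new ggml_mul_mat_q*_q8_1_cuda launchers follows the same pattern: one thread block computes a GGML_CUDA_MMQ_Y by WARP_SIZE tile of the output, the grid is the ceiling division of the output dimensions by that tile, and the load_tiles_* template gets its bounds-checking variant whenever nrows_x is not a multiple of the tile height. A small host-side sketch of that arithmetic; GGML_CUDA_MMQ_Y is defined elsewhere in the file, so its value here is an assumption:

    // Sketch of the grid-size arithmetic used by the launchers above.
    #include <cstdio>

    int main() {
        const int WARP_SIZE       = 32;
        const int GGML_CUDA_MMQ_Y = 64;      // assumption for illustration

        const int nrows_x = 4096;            // rows of the quantized weight matrix
        const int ncols_y = 7;               // batch size (columns of src1)

        const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; // ceil-div
        const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;             // ceil-div

        const bool need_check = nrows_x % GGML_CUDA_MMQ_Y != 0; // selects load_tiles_*<true> vs <false>

        std::printf("grid = (%d, %d), need_check = %d\n", block_num_x, block_num_y, need_check);
        return 0;
    }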
2503
4046
  static void ggml_mul_mat_p021_f16_f32_cuda(
2504
4047
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
2505
4048
  const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
@@ -2544,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
2544
4087
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
2545
4088
  }
2546
4089
 
2547
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
4090
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4091
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
2548
4092
  GGML_ASSERT(nrows % 2 == 0);
2549
4093
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2550
4094
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
2551
4095
  const dim3 block_nums(num_blocks_x, nrows, 1);
2552
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
4096
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
2553
4097
  }
2554
4098
 
2555
4099
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2676,10 +4220,9 @@ static size_t g_scratch_offset = 0;
2676
4220
 
2677
4221
  static int g_device_count = -1;
2678
4222
  static int g_main_device = 0;
2679
- #ifndef GGML_CUDA_FORCE_DMMV
2680
4223
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2681
- #endif
2682
4224
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4225
+ static bool g_mul_mat_q = false;
2683
4226
 
2684
4227
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
2685
4228
 
@@ -2701,9 +4244,7 @@ void ggml_init_cublas() {
2701
4244
  g_tensor_split[id] = total_vram;
2702
4245
  total_vram += prop.totalGlobalMem;
2703
4246
 
2704
- #ifndef GGML_CUDA_FORCE_DMMV
2705
4247
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
2706
- #endif
2707
4248
  }
2708
4249
  for (int id = 0; id < g_device_count; ++id) {
2709
4250
  g_tensor_split[id] /= total_vram;
@@ -2965,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
2965
4506
  (void) i1;
2966
4507
  }
2967
4508
 
4509
+ inline void ggml_cuda_op_mul_mat_q(
4510
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4511
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
4512
+ cudaStream_t & cudaStream_main){
4513
+
4514
+ GGML_ASSERT(src0_ddq_i != nullptr);
4515
+ GGML_ASSERT(src1_ddf_i != nullptr);
4516
+ GGML_ASSERT(dst_ddf_i != nullptr);
4517
+
4518
+ const int64_t ne00 = src0->ne[0];
4519
+
4520
+ const int64_t ne10 = src1->ne[0];
4521
+ const int64_t ne11 = src1->ne[1];
4522
+ GGML_ASSERT(ne10 % QK8_1 == 0);
4523
+
4524
+ const int64_t ne0 = dst->ne[0];
4525
+
4526
+ const int64_t i01_diff = i01_high - i01_low;
4527
+
4528
+ int id;
4529
+ CUDA_CHECK(cudaGetDevice(&id));
4530
+
4531
+ // the main device has a larger memory buffer to hold the results from all GPUs
4532
+ // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
4533
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
4534
+
4535
+ const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
4536
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
4537
+ size_t as;
4538
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
4539
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
4540
+
4541
+ switch (src0->type) {
4542
+ case GGML_TYPE_Q4_0:
4543
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4544
+ break;
4545
+ case GGML_TYPE_Q4_1:
4546
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4547
+ break;
4548
+ case GGML_TYPE_Q5_0:
4549
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4550
+ break;
4551
+ case GGML_TYPE_Q5_1:
4552
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4553
+ break;
4554
+ case GGML_TYPE_Q8_0:
4555
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4556
+ break;
4557
+ case GGML_TYPE_Q2_K:
4558
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4559
+ break;
4560
+ case GGML_TYPE_Q3_K:
4561
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4562
+ break;
4563
+ case GGML_TYPE_Q4_K:
4564
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4565
+ break;
4566
+ case GGML_TYPE_Q5_K:
4567
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4568
+ break;
4569
+ case GGML_TYPE_Q6_K:
4570
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4571
+ break;
4572
+ default:
4573
+ GGML_ASSERT(false);
4574
+ break;
4575
+ }
4576
+
4577
+ ggml_cuda_pool_free(src1_q8_1, as);
4578
+
4579
+ (void) src1;
4580
+ (void) dst;
4581
+ (void) src0_ddf_i;
4582
+ (void) i02;
4583
+ (void) i1;
4584
+ }
4585
+
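ggml_cuda_op_mul_mat_q first quantizes the src1 slice to q8_1, padding each row up to a multiple of MATRIX_ROW_PADDING so the tiled kernels never read past a row boundary, and sizes the temporary buffer from the padded row length. A sketch of that size arithmetic; MATRIX_ROW_PADDING and QK8_1 are defined elsewhere in ggml-cuda.cu, so the values and the stand-in struct below are assumptions for illustration:

    #include <cstdio>
    #include <cstdint>

    struct block_q8_1_sketch {     // stand-in with the same size as block_q8_1 (half2 + 32 int8 quants)
        uint16_t ds[2];
        int8_t   qs[32];
    };

    int main() {
        const int64_t MATRIX_ROW_PADDING = 512;  // assumption for illustration
        const int64_t QK8_1              = 32;   // assumption for illustration

        const int64_t ne10 = 4097;               // src1 row length before padding
        const int64_t ne11 = 7;                  // number of src1 rows in this slice

        const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        const size_t bytes = padded_row_size*ne11*sizeof(block_q8_1_sketch)/QK8_1;
        std::printf("padded_row_size = %lld, q8_1 buffer = %zu bytes\n",
                    (long long) padded_row_size, bytes);
        return 0;
    }

This is also why quantize_row_q8_1_cuda now takes a row count: the matrix path quantizes ne11 rows at once, while the vector path below passes 1.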
2968
4586
  inline void ggml_cuda_op_mul_mat_vec(
2969
4587
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2970
4588
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2979,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2979
4597
 
2980
4598
  #ifdef GGML_CUDA_FORCE_DMMV
2981
4599
  const bool use_mul_mat_vec_q = false;
4600
+ (void) g_compute_capabilities[0];
2982
4601
  #else
2983
4602
  int id;
2984
4603
  CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +4625,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3006
4625
  ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
3007
4626
  size_t as;
3008
4627
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
3009
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
4628
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
3010
4629
 
3011
4630
  switch (src0->type) {
3012
4631
  case GGML_TYPE_Q4_0:
@@ -3047,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3047
4666
  ggml_cuda_pool_free(src1_q8_1, as);
3048
4667
  } else {
3049
4668
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
3050
- #ifdef GGML_CUDA_DMMV_F16
4669
+ #ifdef GGML_CUDA_F16
3051
4670
  size_t ash;
3052
4671
  dfloat * src1_dfloat = nullptr; // dfloat == half
3053
4672
 
@@ -3063,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3063
4682
  }
3064
4683
  #else
3065
4684
  dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
3066
- #endif // GGML_CUDA_DMMV_F16
4685
+ #endif // GGML_CUDA_F16
3067
4686
 
3068
4687
  switch (src0->type) {
3069
4688
  case GGML_TYPE_Q4_0:
@@ -3104,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
3104
4723
  break;
3105
4724
  }
3106
4725
 
3107
- #ifdef GGML_CUDA_DMMV_F16
4726
+ #ifdef GGML_CUDA_F16
3108
4727
  if (src1_convert_f16) {
3109
4728
  ggml_cuda_pool_free(src1_dfloat, ash);
3110
4729
  }
3111
- #endif // GGML_CUDA_DMMV_F16
4730
+ #endif // GGML_CUDA_F16
3112
4731
  }
3113
4732
 
3114
4733
  (void) src1;
@@ -3168,6 +4787,7 @@ inline void ggml_cuda_op_rope(
3168
4787
  GGML_ASSERT(dst_ddf_i != nullptr);
3169
4788
 
3170
4789
  const int64_t ne00 = src0->ne[0];
4790
+ const int64_t ne01 = src0->ne[1];
3171
4791
  const int64_t i01_diff = i01_high - i01_low;
3172
4792
 
3173
4793
  const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +4801,18 @@ inline void ggml_cuda_op_rope(
3181
4801
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
3182
4802
 
3183
4803
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
3184
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3185
4804
 
3186
- bool is_glm = mode & 4;
4805
+ const bool is_glm = mode & 4;
3187
4806
 
3188
4807
  // compute
3189
4808
  if (is_glm) {
4809
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3190
4810
  const float id_p = min(p, n_ctx - 2.f);
3191
4811
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
3192
4812
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3193
4813
  } else {
3194
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
4814
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
4815
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
3195
4816
  }
3196
4817
 
3197
4818
  (void) src1;
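The RoPE path no longer bakes a single position p into the launch; it passes p0 (the position offset of the first row) together with p_delta and p_delta_rows so the kernel can derive each row's angle itself. A minimal sketch of the per-element angle this parameterization suggests; the kernel body is outside this hunk, so the formula below is an assumption based on the parameter names:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base   = 10000.0f;
        const int   n_dims      = 128;
        const float theta_scale = std::pow(freq_base, -2.0f/n_dims);

        const float p0           = 32.0f;   // (n_past)*freq_scale for the first row
        const float p_delta      = 1.0f;    // freq_scale: position step every p_delta_rows rows
        const int   p_delta_rows = 4096;    // ne01: rows that share one position

        const int row = 8192, col = 10;     // example element
        const float theta = (p0 + p_delta*(row/p_delta_rows)) * std::pow(theta_scale, (float)(col/2));

        std::printf("theta = %f (cos %f, sin %f)\n", theta, std::cos(theta), std::sin(theta));
        return 0;
    }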
@@ -3363,7 +4984,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3363
4984
  int64_t row_low, row_high;
3364
4985
  if (split) {
3365
4986
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
3366
- row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
4987
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
4988
+
4989
+ if (id == g_device_count - 1) {
4990
+ row_high = nrows0;
4991
+ } else {
4992
+ row_high = nrows0*g_tensor_split[id + 1];
4993
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
4994
+ }
3367
4995
  } else {
3368
4996
  row_low = 0;
3369
4997
  row_high = nrows0*i02_divisor;
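With the tiled mul_mat_q kernels, each device's slice of src0 has to start and end on a multiple of GGML_CUDA_MMQ_Y (the tile height), so the split boundaries derived from g_tensor_split are now rounded down, with the last device absorbing the remainder; the same adjustment appears again below in ggml_cuda_transform_tensor. A small sketch of the boundary arithmetic, with a hypothetical tile height and split:

    #include <cstdio>
    #include <cstdint>

    int main() {
        const int64_t GGML_CUDA_MMQ_Y = 64;            // assumption for illustration
        const int64_t nrows0          = 5120;
        const float   tensor_split[]  = {0.0f, 0.6f};  // two hypothetical devices
        const int     device_count    = 2;

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low = id == 0 ? 0 : (int64_t)(nrows0*tensor_split[id]);
            row_low -= row_low % GGML_CUDA_MMQ_Y;      // round down to a tile boundary

            int64_t row_high;
            if (id == device_count - 1) {
                row_high = nrows0;                     // last device takes the remainder
            } else {
                row_high = (int64_t)(nrows0*tensor_split[id + 1]);
                row_high -= row_high % GGML_CUDA_MMQ_Y;
            }
            std::printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }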
@@ -3529,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3529
5157
  if (split) {
3530
5158
  // src0 = weight matrix is saved as a transposed matrix for better memory layout.
3531
5159
  // dst is NOT transposed.
3532
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
5160
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
3533
5161
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
3534
5162
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
3535
- for (int64_t j = 0; j < ne1; ++j) {
3536
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
3537
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
3538
- }
5163
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
5164
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
5165
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
3539
5166
  } else {
3540
5167
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
3541
5168
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
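The per-row cudaMemcpyAsync loop in the split path is collapsed into one strided copy: each of the ne1 result rows is i01_diff floats wide in the contiguous device buffer but has to land at a stride of ne0 floats in dst. A sketch of how the loop maps onto cudaMemcpy2DAsync's (dst, dpitch, src, spitch, width, height) parameters; the wrapper and buffer names are placeholders, and the byte offsets for i02/i03 are omitted:

    #include <cuda_runtime.h>
    #include <cstdint>

    // Copies an i01_diff x ne1 slice into a dst matrix whose rows are ne0 floats wide.
    static void copy_slice(float * dst_off_device, const float * dst_ddf_i,
                           int64_t ne0, int64_t ne1, int64_t i01_low, int64_t i01_diff,
                           cudaMemcpyKind kind, cudaStream_t stream) {
        float * dhf_dst_i = dst_off_device + i01_low;   // column offset of this device's slice
        // dpitch = ne0 floats (full dst row), spitch = width = i01_diff floats (slice row), height = ne1 rows
        cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
                          dst_ddf_i, i01_diff*sizeof(float),
                          i01_diff*sizeof(float), ne1, kind, stream);
    }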
@@ -3718,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
3718
5345
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3719
5346
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
3720
5347
  } else {
3721
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5348
+ int min_compute_capability = INT_MAX;
5349
+ for (int id = 0; id < g_device_count; ++id) {
5350
+ if (min_compute_capability > g_compute_capabilities[id]) {
5351
+ min_compute_capability = g_compute_capabilities[id];
5352
+ }
5353
+ }
5354
+
5355
+ if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
5356
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
5357
+ } else {
5358
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5359
+ }
3722
5360
  }
3723
5361
  } else {
3724
5362
  GGML_ASSERT(false);
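ggml_cuda_mul_mat now routes quantized matrices through mul_mat_q only when the flag is set and every device in the split reports at least MIN_CC_DP4A, i.e. is new enough for the __dp4a packed int8 dot-product instruction the quantized kernels rely on (introduced with compute capability 6.1). A hedged device-side sketch of what __dp4a buys over the scalar equivalent; this helper is illustrative only, not code from ggml-cuda.cu:

    #include <cstdint>

    // One __dp4a performs four int8 multiply-adds plus an accumulate in a single
    // instruction on compute capability >= 6.1; the fallback spells the same
    // arithmetic out element by element.
    static __device__ __forceinline__ int dot4_i8(const int a, const int b, const int acc) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
        return __dp4a(a, b, acc);
    #else
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        int sum = acc;
        for (int i = 0; i < 4; ++i) {
            sum += a8[i]*b8[i];
        }
        return sum;
    #endif
    }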
@@ -3795,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
3795
5433
 
3796
5434
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3797
5435
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3798
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
5436
+
5437
+ const int mode = ((int32_t *) dst->op_params)[2];
5438
+ const bool is_glm = mode & 4;
5439
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
3799
5440
  }
3800
5441
 
3801
5442
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3828
5469
  row_high = nrows;
3829
5470
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
3830
5471
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
3831
- row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
5472
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
5473
+
5474
+ if (id == g_device_count - 1) {
5475
+ row_high = nrows;
5476
+ } else {
5477
+ row_high = nrows*g_tensor_split[id + 1];
5478
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
5479
+ }
3832
5480
  } else {
3833
5481
  GGML_ASSERT(false);
3834
5482
  }
@@ -4002,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
4002
5650
  }
4003
5651
  }
4004
5652
 
5653
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
5654
+ g_mul_mat_q = mul_mat_q;
5655
+ }
5656
+
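The new setter is the host-side switch for the g_mul_mat_q flag consulted in ggml_cuda_mul_mat above; a caller can flip it once before offloading work. A hypothetical usage sketch, with the prototypes declared inline here instead of pulling in ggml-cuda.h and the surrounding setup reduced to placeholders:

    extern "C" {
    void ggml_init_cublas(void);
    void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
    }

    int main() {
        ggml_init_cublas();              // pick up device count and compute capabilities
        ggml_cuda_set_mul_mat_q(true);   // prefer the quantized mul_mat_q kernels where supported
        // ... build and evaluate the graph as usual ...
        return 0;
    }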
4005
5657
  void ggml_cuda_set_scratch_size(size_t scratch_size) {
4006
5658
  g_scratch_size = scratch_size;
4007
5659
  }