llama_cpp 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
52
  } while (0)
53
53
  #endif // CUDART_VERSION >= 11
54
54
 
55
- #ifdef GGML_CUDA_DMMV_F16
55
+ #ifdef GGML_CUDA_F16
56
56
  typedef half dfloat; // dequantize float
57
57
  typedef half2 dfloat2;
58
58
  #else
59
59
  typedef float dfloat; // dequantize float
60
60
  typedef float2 dfloat2;
61
- #endif //GGML_CUDA_DMMV_F16
61
+ #endif //GGML_CUDA_F16
62
+
63
+ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
64
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
65
+
66
+ int x32 = 0;
67
+ x32 |= x16[0] << 0;
68
+ x32 |= x16[1] << 16;
69
+
70
+ return x32;
71
+ }
72
+
73
+ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
74
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
75
+
76
+ int x32 = 0;
77
+ x32 |= x16[0] << 0;
78
+ x32 |= x16[1] << 16;
79
+
80
+ return x32;
81
+ }
82
+
83
+ static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
84
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
85
+ }
86
+
87
+ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
88
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
89
+ }
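These four helpers reinterpret four consecutive quant bytes as one 32-bit integer so the value can be fed straight into the __dp4a intrinsic; the non-aligned variants go through two 16-bit loads because the quant arrays inside the block structs are only guaranteed 2-byte alignment. A minimal host-side sketch of the same packing, for illustration only (the helper name is hypothetical and not part of the diff):

    // Illustrative host-side equivalent of get_int_from_uint8 (assumption, not in the package).
    #include <cstdint>
    #include <cstring>

    static int pack4_from_bytes(const uint8_t * x8, int i32) {
        int x32 = 0;
        std::memcpy(&x32, x8 + sizeof(int) * i32, sizeof(int)); // byte-wise copy, no alignment requirement
        return x32;                                             // the 4 bytes become the four __dp4a lanes
    }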
62
90
 
63
91
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
64
92
  typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
87
115
  #define QR4_1 2
88
116
  #define QI4_1 (QK4_1 / (4 * QR4_1))
89
117
  typedef struct {
90
- half d; // delta
91
- half m; // min
118
+ half2 dm; // dm.x = delta, dm.y = min
92
119
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
93
120
  } block_q4_1;
94
121
  static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
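Throughout this release the separate half d / half m (or d/dmin, d/s) fields of the quant block structs are fused into a single half2, so both scale factors travel in one 4-byte load and can be converted or multiplied with packed half intrinsics. A minimal device-side sketch of the read pattern, for illustration only (the helper name is hypothetical):

    // Illustrative only: unpacking the fused scale/min pair with one conversion.
    __device__ __forceinline__ float2 unpack_dm(const half2 dm) {
        return __half22float2(dm); // .x = delta (d), .y = min (m)
    }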
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
107
134
  #define QR5_1 2
108
135
  #define QI5_1 (QK5_1 / (4 * QR5_1))
109
136
  typedef struct {
110
- half d; // delta
111
- half m; // min
137
+ half2 dm; // dm.x = delta, dm.y = min
112
138
  uint8_t qh[4]; // 5-th bit of quants
113
139
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
114
140
  } block_q5_1;
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
127
153
  #define QR8_1 1
128
154
  #define QI8_1 (QK8_1 / (4 * QR8_1))
129
155
  typedef struct {
130
- half d; // delta
131
- half s; // unquantized sum
156
+ half2 ds; // ds.x = delta, ds.y = sum
132
157
  int8_t qs[QK8_0]; // quants
133
158
  } block_q8_1;
134
159
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
135
160
 
136
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
161
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
162
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
163
+ typedef void (*load_tiles_cuda_t)(
164
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
165
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
166
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
167
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
168
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
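The three new function-pointer types are the per-quantization hooks of the tiled mul_mat_q path introduced later in this diff (the allocate_tiles_*, load_tiles_* and vec_dot_*_mul_mat functions below). An outline of how a tiled kernel would wire them together, as a sketch only; the loop bounds and synchronization placement are assumptions, not quoted from the diff:

    // Sketch of the control flow the typedefs imply (illustrative, not the actual kernel):
    //   int *x_ql, *x_qh, *x_sc; half2 *x_dm;
    //   allocate_tiles(&x_ql, &x_dm, &x_qh, &x_sc);          // reserve shared-memory tiles
    //   for (each tile of blocks along the shared dimension) {
    //       load_tiles(vx, x_ql, x_dm, x_qh, x_sc, i_offset, i_max, k, blocks_per_row);
    //       __syncthreads();
    //       sum += vec_dot(x_ql, x_dm, x_qh, x_sc, y_qs, y_ds, i, j, k);
    //       __syncthreads();
    //   }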
137
169
 
138
170
  //================================= k-quants
139
171
 
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
150
182
  typedef struct {
151
183
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
152
184
  uint8_t qs[QK_K/4]; // quants
153
- half d; // super-block scale for quantized scales
154
- half dmin; // super-block scale for quantized mins
185
+ half2 dm; // super-block scale for quantized scales/mins
155
186
  } block_q2_K;
156
187
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
157
188
 
@@ -180,8 +211,7 @@ typedef struct {
180
211
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
181
212
  #else
182
213
  typedef struct {
183
- half d; // super-block scale for quantized scales
184
- half dmin; // super-block scale for quantized mins
214
+ half2 dm; // super-block scale for quantized scales/mins
185
215
  uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
186
216
  uint8_t qs[QK_K/2]; // 4-bit quants
187
217
  } block_q4_K;
@@ -200,11 +230,10 @@ typedef struct {
200
230
  static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
201
231
  #else
202
232
  typedef struct {
203
- half d; // super-block scale for quantized scales
204
- half dmin; // super-block scale for quantized mins
205
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
206
- uint8_t qh[QK_K/8]; // quants, high bit
207
- uint8_t qs[QK_K/2]; // quants, low 4 bits
233
+ half2 dm; // super-block scale for quantized scales/mins
234
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
235
+ uint8_t qh[QK_K/8]; // quants, high bit
236
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
208
237
  } block_q5_K;
209
238
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
210
239
  #endif
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
233
262
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
234
263
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
235
264
 
265
+ #ifndef GGML_CUDA_MMQ_Y
266
+ #define GGML_CUDA_MMQ_Y 64
267
+ #endif // GGML_CUDA_MMQ_Y
268
+
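GGML_CUDA_MMQ_Y is the tile height of the new mul_mat_q kernels, i.e. how many rows of the quantized matrix each thread block stages into shared memory per pass; like the DMMV knobs below it is only a compile-time default. A hedged example of overriding it at build time (the exact flags depend on the build system in use):

    // Assumption (illustrative): raise the mul_mat_q tile height when compiling the CUDA sources.
    //   nvcc -DGGML_CUDA_MMQ_Y=128 ...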
236
269
  // dmmv = dequantize_mul_mat_vec
237
270
  #ifndef GGML_CUDA_DMMV_X
238
271
  #define GGML_CUDA_DMMV_X 32
@@ -367,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
367
400
  v.x = vui & 0xF;
368
401
  v.y = vui >> 4;
369
402
 
370
- #ifdef GGML_CUDA_DMMV_F16
403
+ #ifdef GGML_CUDA_F16
371
404
  v = __hsub2(v, {8.0f, 8.0f});
372
405
  v = __hmul2(v, {d, d});
373
406
  #else
374
407
  v.x = (v.x - 8.0f) * d;
375
408
  v.y = (v.y - 8.0f) * d;
376
- #endif // GGML_CUDA_DMMV_F16
409
+ #endif // GGML_CUDA_F16
377
410
  }
378
411
 
379
412
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
380
413
  const block_q4_1 * x = (const block_q4_1 *) vx;
381
414
 
382
- const dfloat d = x[ib].d;
383
- const dfloat m = x[ib].m;
415
+ const dfloat d = x[ib].dm.x;
416
+ const dfloat m = x[ib].dm.y;
384
417
 
385
418
  const int vui = x[ib].qs[iqs];
386
419
 
387
420
  v.x = vui & 0xF;
388
421
  v.y = vui >> 4;
389
422
 
390
- #ifdef GGML_CUDA_DMMV_F16
423
+ #ifdef GGML_CUDA_F16
391
424
  v = __hmul2(v, {d, d});
392
425
  v = __hadd2(v, {m, m});
393
426
  #else
394
427
  v.x = (v.x * d) + m;
395
428
  v.y = (v.y * d) + m;
396
- #endif // GGML_CUDA_DMMV_F16
429
+ #endif // GGML_CUDA_F16
397
430
  }
398
431
 
399
432
  static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -410,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
410
443
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
411
444
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
412
445
 
413
- #ifdef GGML_CUDA_DMMV_F16
446
+ #ifdef GGML_CUDA_F16
414
447
  v = __hsub2(v, {16.0f, 16.0f});
415
448
  v = __hmul2(v, {d, d});
416
449
  #else
417
450
  v.x = (v.x - 16.0f) * d;
418
451
  v.y = (v.y - 16.0f) * d;
419
- #endif // GGML_CUDA_DMMV_F16
452
+ #endif // GGML_CUDA_F16
420
453
  }
421
454
 
422
455
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
423
456
  const block_q5_1 * x = (const block_q5_1 *) vx;
424
457
 
425
- const dfloat d = x[ib].d;
426
- const dfloat m = x[ib].m;
458
+ const dfloat d = x[ib].dm.x;
459
+ const dfloat m = x[ib].dm.y;
427
460
 
428
461
  uint32_t qh;
429
462
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
434
467
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
435
468
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
436
469
 
437
- #ifdef GGML_CUDA_DMMV_F16
470
+ #ifdef GGML_CUDA_F16
438
471
  v = __hmul2(v, {d, d});
439
472
  v = __hadd2(v, {m, m});
440
473
  #else
441
474
  v.x = (v.x * d) + m;
442
475
  v.y = (v.y * d) + m;
443
- #endif // GGML_CUDA_DMMV_F16
476
+ #endif // GGML_CUDA_F16
444
477
  }
445
478
 
446
479
  static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
451
484
  v.x = x[ib].qs[iqs + 0];
452
485
  v.y = x[ib].qs[iqs + 1];
453
486
 
454
- #ifdef GGML_CUDA_DMMV_F16
487
+ #ifdef GGML_CUDA_F16
455
488
  v = __hmul2(v, {d, d});
456
489
  #else
457
490
  v.x *= d;
458
491
  v.y *= d;
459
- #endif // GGML_CUDA_DMMV_F16
492
+ #endif // GGML_CUDA_F16
460
493
  }
461
494
 
462
495
  //================================== k-quants
@@ -475,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
475
508
  const uint8_t q = x[i].qs[32*n + l];
476
509
  float * y = yy + i*QK_K + 128*n;
477
510
 
478
- float dall = x[i].d;
479
- float dmin = x[i].dmin;
511
+ float dall = x[i].dm.x;
512
+ float dmin = x[i].dm.y;
480
513
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
481
514
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
482
515
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
486
519
  const int il = tid%16; // 0...15
487
520
  const uint8_t q = x[i].qs[il] >> (2*is);
488
521
  float * y = yy + i*QK_K + 16*is + il;
489
- float dall = x[i].d;
490
- float dmin = x[i].dmin;
522
+ float dall = x[i].dm.x;
523
+ float dmin = x[i].dm.y;
491
524
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
492
525
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
493
526
  #endif
@@ -573,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
573
606
 
574
607
  float * y = yy + i*QK_K + 64*il + n*ir;
575
608
 
576
- const float dall = x[i].d;
577
- const float dmin = x[i].dmin;
609
+ const float dall = x[i].dm.x;
610
+ const float dmin = x[i].dm.y;
578
611
 
579
612
  const uint8_t * q = x[i].qs + 32*il + n*ir;
580
613
 
@@ -612,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
612
645
 
613
646
  float * y = yy + i*QK_K + 64*il + 2*ir;
614
647
 
615
- const float dall = x[i].d;
616
- const float dmin = x[i].dmin;
648
+ const float dall = x[i].dm.x;
649
+ const float dmin = x[i].dm.y;
617
650
 
618
651
  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
619
652
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
725
758
  const float * y = yy + i * QK_K + y_offset;
726
759
  const uint8_t * q = x[i].qs + q_offset;
727
760
 
728
- const float dall = x[i].d;
729
- const float dmin = x[i].dmin;
761
+ const float dall = x[i].dm.x;
762
+ const float dmin = x[i].dm.y;
730
763
 
731
764
  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
732
765
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
768
801
  uaux[0] = s[0] & 0x0f0f0f0f;
769
802
  uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
770
803
 
771
- const half2 * dh = (const half2 *)&x[i].d;
772
-
773
- const float2 dall = __half22float2(dh[0]);
804
+ const float2 dall = __half22float2(x[i].dm);
774
805
 
775
806
  float sum1 = 0, sum2 = 0;
776
807
  for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +979,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
948
979
  const float * y1 = yy + i*QK_K + y_offset;
949
980
  const float * y2 = y1 + 128;
950
981
 
951
- const float dall = x[i].d;
952
- const float dmin = x[i].dmin;
982
+ const float dall = x[i].dm.x;
983
+ const float dmin = x[i].dm.y;
953
984
 
954
985
  const uint16_t * a = (const uint16_t *)x[i].scales;
955
986
  aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1112,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1081
1112
  const float * y1 = yy + i*QK_K + y_offset;
1082
1113
  const float * y2 = y1 + 128;
1083
1114
 
1084
- const float dall = x[i].d;
1085
- const float dmin = x[i].dmin;
1115
+ const float dall = x[i].dm.x;
1116
+ const float dmin = x[i].dm.y;
1086
1117
 
1087
1118
  const uint16_t * a = (const uint16_t *)x[i].scales;
1088
1119
  aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1270
1301
  v.y = x[ib + iqs + 1];
1271
1302
  }
1272
1303
 
1273
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1274
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
1304
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1305
+ const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1275
1306
 
1276
- if (i >= k) {
1307
+ if (ix >= kx_padded) {
1277
1308
  return;
1278
1309
  }
1279
1310
 
1311
+ const int iy = blockDim.y*blockIdx.y + threadIdx.y;
1312
+
1313
+ const int i_padded = iy*kx_padded + ix;
1314
+
1280
1315
  block_q8_1 * y = (block_q8_1 *) vy;
1281
1316
 
1282
- const int ib = i / QK8_1; // block index
1283
- const int iqs = i % QK8_1; // quant index
1317
+ const int ib = i_padded / QK8_1; // block index
1318
+ const int iqs = i_padded % QK8_1; // quant index
1284
1319
 
1285
- const float xi = i < ndata ? x[i] : 0.0f;
1320
+ const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
1286
1321
  float amax = fabsf(xi);
1287
1322
  float sum = xi;
1288
1323
 
@@ -1301,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1301
1336
  return;
1302
1337
  }
1303
1338
 
1304
- y[ib].d = d;
1305
- y[ib].s = sum;
1339
+ y[ib].ds.x = d;
1340
+ y[ib].ds.y = sum;
1306
1341
  }
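quantize_q8_1 now takes the logical row width kx plus a padded width kx_padded and is indexed in two dimensions, so a whole matrix (one row per y index) can be quantized in a single launch, with each row padded out to a multiple of the block size and the padding elements quantized to zero. A host-side launch sketch under those assumptions; the grid shape and variable names are illustrative, not quoted from the diff:

    // Illustrative launch (assumption): quantize ky rows of width kx into padded q8_1 rows.
    const int block_size = CUDA_QUANTIZE_BLOCK_SIZE; // 256
    const dim3 grid((kx_padded + block_size - 1) / block_size, ky, 1);
    quantize_q8_1<<<grid, block_size, 0, stream>>>(x, vy, kx, kx_padded);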
1307
1342
 
1308
1343
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1326
1361
  y[iybs + iqs + y_offset] = v.y;
1327
1362
  }
1328
1363
 
1329
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1330
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1331
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1332
- const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1364
+ // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
1365
+ // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
1333
1366
 
1334
- int vi;
1335
- memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1336
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1337
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1367
+ #define VDR_Q4_0_Q8_1_MMVQ 2
1368
+ #define VDR_Q4_0_Q8_1_MMQ 4
1338
1369
 
1339
- const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1370
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
1371
+ const int * v, const int * u, const float & d4, const half2 & ds8) {
1372
+
1373
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1374
+ int sumi = 0;
1340
1375
 
1341
- // subtract 8 from each quantized value
1342
- const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1343
- const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1376
+ #pragma unroll
1377
+ for (int i = 0; i < vdr; ++i) {
1378
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1379
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1344
1380
 
1345
- // SIMD dot product of quantized values
1346
- int sumi = __dp4a(vi0, ui0, 0);
1347
- sumi = __dp4a(vi1, ui1, sumi);
1381
+ // SIMD dot product of quantized values
1382
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1383
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1384
+ }
1348
1385
 
1349
- return sumi*d;
1386
+ // second part effectively subtracts 8 from each quant value
1387
+ return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
1350
1388
  #else
1351
1389
  return 0.0f; // only to satisfy the compiler
1352
1390
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1353
1391
  }
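All of the templated *_impl dot products above and below bottom out in __dp4a on DP4A-capable GPUs (the #else branch only satisfies the compiler), which multiplies four packed signed 8-bit lanes and accumulates into a 32-bit integer. A small worked example of the intrinsic, for illustration only:

    // __dp4a(a, b, c) == c + a0*b0 + a1*b1 + a2*b2 + a3*b3 over the four 8-bit lanes, e.g.
    //   __dp4a(0x01020304, 0x01010101, 0) == 1 + 2 + 3 + 4 == 10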
1354
1392
 
1355
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1356
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1357
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1358
- const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1393
+ #define VDR_Q4_1_Q8_1_MMVQ 2
1394
+ #define VDR_Q4_1_Q8_1_MMQ 4
1395
+
1396
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
1397
+ const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
1359
1398
 
1360
- const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1361
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1362
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1399
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1400
+ int sumi = 0;
1363
1401
 
1364
- const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1365
- const float m = bq4_1->m;
1366
- const float s = bq8_1->s;
1402
+ #pragma unroll
1403
+ for (int i = 0; i < vdr; ++i) {
1404
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1405
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1367
1406
 
1368
- const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1369
- const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1407
+ // SIMD dot product of quantized values
1408
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1409
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1410
+ }
1370
1411
 
1371
- // SIMD dot product of quantized values
1372
- int sumi = __dp4a(vi0, ui0, 0);
1373
- sumi = __dp4a(vi1, ui1, sumi);
1412
+ #ifdef GGML_CUDA_F16
1413
+ const half2 tmp = __hmul2(dm4, ds8);
1414
+ const float d4d8 = __half2float(tmp.x);
1415
+ const float m4s8 = __half2float(tmp.y);
1416
+ #else
1417
+ const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
1418
+ const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
1419
+ #endif // GGML_CUDA_F16
1374
1420
 
1375
- return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1421
+ // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1422
+ return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1376
1423
  #else
1377
1424
  return 0.0f; // only to satisfy the compiler
1378
1425
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1379
1426
  }
1380
1427
 
1381
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1382
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1428
+ #define VDR_Q5_0_Q8_1_MMVQ 2
1429
+ #define VDR_Q5_0_Q8_1_MMQ 4
1430
+
1431
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
1432
+ const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
1433
+
1383
1434
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1384
- const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1435
+ int sumi = 0;
1436
+
1437
+ for (int i = 0; i < vdr; ++i) {
1438
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1439
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1440
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1441
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1442
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1443
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1444
+
1445
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1446
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1447
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1448
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1449
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1450
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1451
+ }
1385
1452
 
1386
- int qs;
1387
- memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1388
- const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1389
- const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1390
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1391
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1392
-
1393
- const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1394
-
1395
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1396
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1397
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1398
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1399
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1400
- vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1401
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1402
-
1403
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1404
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1405
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1406
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1407
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1408
- vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1409
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1410
-
1411
- return sumi*d;
1453
+ // second part effectively subtracts 16 from each quant value
1454
+ return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
1412
1455
  #else
1413
1456
  return 0.0f; // only to satisfy the compiler
1414
1457
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1415
1458
  }
1416
1459
 
1417
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1418
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1460
+ #define VDR_Q5_1_Q8_1_MMVQ 2
1461
+ #define VDR_Q5_1_Q8_1_MMQ 4
1462
+
1463
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
1464
+ const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
1465
+
1419
1466
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1420
- const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1467
+ int sumi = 0;
1468
+
1469
+ for (int i = 0; i < vdr; ++i) {
1470
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1471
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1472
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1473
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1474
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1475
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1476
+
1477
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1478
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1479
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1480
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1481
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1482
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1483
+ }
1484
+
1485
+ #ifdef GGML_CUDA_F16
1486
+ const half2 tmp = __hmul2(dm5, ds8);
1487
+ const float d5d8 = __half2float(tmp.x);
1488
+ const float m5s8 = __half2float(tmp.y);
1489
+ #else
1490
+ const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
1491
+ const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
1492
+ #endif // GGML_CUDA_F16
1493
+
1494
+ // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
1495
+ return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1421
1496
 
1422
- const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1423
- const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1424
- const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1425
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1426
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1427
-
1428
- const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1429
- const float m = bq5_1->m;
1430
- const float s = bq8_1->s;
1431
-
1432
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1433
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1434
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1435
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1436
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1437
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1438
-
1439
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1440
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1441
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1442
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1443
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1444
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1445
-
1446
- return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1447
1497
  #else
1448
1498
  return 0.0f; // only to satisfy the compiler
1449
1499
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1450
1500
  }
1451
1501
 
1452
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1453
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1454
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1455
- const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1502
+ #define VDR_Q8_0_Q8_1_MMVQ 2
1503
+ #define VDR_Q8_0_Q8_1_MMQ 8
1456
1504
 
1457
- int vi;
1458
- memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1459
- const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1505
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1506
+ const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
1460
1507
 
1461
- const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1508
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1509
+ int sumi = 0;
1462
1510
 
1463
- // SIMD dot product of quantized values
1464
- int sumi = __dp4a(vi, ui, 0);
1511
+ for (int i = 0; i < vdr; ++i) {
1512
+ // SIMD dot product of quantized values
1513
+ sumi = __dp4a(v[i], u[i], sumi);
1514
+ }
1465
1515
 
1466
- return sumi*d;
1516
+ return sumi * d8_0 * __half2float(ds8_1.x);
1467
1517
  #else
1468
1518
  return 0.0f; // only to satisfy the compiler
1469
1519
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1470
1520
  }
1471
1521
 
1472
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1473
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1522
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
1523
+ const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
1474
1524
 
1475
1525
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1476
- const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1526
+ int sumi = 0;
1477
1527
 
1478
- const int bq8_offset = QR2_K * (iqs / QI8_1);
1479
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1528
+ for (int i = 0; i < vdr; ++i) {
1529
+ // SIMD dot product of quantized values
1530
+ sumi = __dp4a(v[i], u[i], sumi);
1531
+ }
1480
1532
 
1481
- float sumf_d = 0.0f;
1482
- float sumf_m = 0.0f;
1533
+ #ifdef GGML_CUDA_F16
1534
+ const half2 tmp = __hmul2(dm8, ds8);
1535
+ const float d8d8 = __half2float(tmp.x);
1536
+ const float m8s8 = __half2float(tmp.y);
1537
+ #else
1538
+ const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
+ const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1540
+ #endif // GGML_CUDA_F16
1483
1541
 
1484
- const float d = bq2_K->d;
1485
- const float dmin = bq2_K->dmin;
1542
+ // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
1543
+ return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1544
+ #else
1545
+ return 0.0f; // only to satisfy the compiler
1546
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
+ }
1486
1548
 
1487
- const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1549
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1550
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1488
1551
 
1489
- for (int i = 0; i < QR2_K; ++i) {
1490
- const int sc = bq2_K->scales[scale_offset + 2*i];
1552
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1491
1553
 
1492
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1493
- const float d8i = bq8i->d;
1554
+ int v[VDR_Q4_0_Q8_1_MMVQ];
1555
+ int u[2*VDR_Q4_0_Q8_1_MMVQ];
1494
1556
 
1495
- const int vi = (v >> (2*i)) & 0x03030303;
1496
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1557
+ #pragma unroll
1558
+ for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
1559
+ v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
1560
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1561
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
1562
+ }
1563
+
1564
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
+ }
1566
+
1567
+ static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
+
1569
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1571
+
1572
+ *x_ql = tile_x_qs;
1573
+ *x_dm = (half2 *) tile_x_d;
1574
+ }
1575
+
1576
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
+
1580
+ __builtin_assume(i_offset >= 0);
1581
+ __builtin_assume(i_offset < 8);
1582
+ __builtin_assume(k >= 0);
1583
+ __builtin_assume(k < WARP_SIZE);
1584
+
1585
+ const int kbx = k / QI4_0;
1586
+ const int kqsx = k % QI4_0;
1587
+
1588
+ const block_q4_0 * bx0 = (block_q4_0 *) vx;
1589
+
1590
+ float * x_dmf = (float *) x_dm;
1591
+
1592
+ #pragma unroll
1593
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1594
+ int i = i0 + i_offset;
1497
1595
 
1498
- sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1499
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1596
+ if (need_check) {
1597
+ i = min(i, i_max);
1598
+ }
1599
+
1600
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
+
1602
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1500
1604
  }
1501
1605
 
1502
- return d*sumf_d - dmin*sumf_m;
1503
- #else
1504
- return 0.0f; // only to satisfy the compiler
1505
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1606
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
+ // const int kbxd = k % blocks_per_tile_x_row;
1608
+
1609
+ // #pragma unroll
1610
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
+ // FIXME out-of-bounds
1612
+ // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
+
1614
+ // if (i >= GGML_CUDA_MMQ_Y) {
1615
+ // return;
1616
+ // }
1617
+
1618
+ // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
+
1620
+ // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
+ // }
1506
1622
  }
1507
1623
 
1508
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1509
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1624
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1510
1627
 
1511
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1512
- const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1628
+ __builtin_assume(i >= 0);
1629
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
+ __builtin_assume(j >= 0);
1631
+ __builtin_assume(j < WARP_SIZE);
1632
+ __builtin_assume(k >= 0);
1633
+ __builtin_assume(k < WARP_SIZE);
1513
1634
 
1514
- const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1515
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1635
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
+ const float * x_dmf = (float *) x_dm;
1516
1637
 
1517
- float sumf = 0.0f;
1638
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
1518
1639
 
1519
- const float d = bq3_K->d;
1640
+ #pragma unroll
1641
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
1644
+ }
1520
1645
 
1521
- int vl;
1522
- memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1646
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1649
+ }
1523
1650
 
1524
- int vh;
1525
- memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1526
- vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1527
- vh >>= bq8_offset;
1651
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1652
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1528
1653
 
1529
- for (int i = 0; i < QR3_K; ++i) {
1530
- const int isc = scale_offset + 2*i;
1654
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1531
1655
 
1532
- const int isc_low = isc % (QK_K/32);
1533
- const int sc_shift_low = 4 * (isc / (QK_K/32));
1534
- const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1656
+ int v[VDR_Q4_1_Q8_1_MMVQ];
1657
+ int u[2*VDR_Q4_1_Q8_1_MMVQ];
1535
1658
 
1536
- const int isc_high = isc % (QK_K/64);
1537
- const int sc_shift_high = 2 * (isc / (QK_K/64));
1538
- const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1659
+ #pragma unroll
1660
+ for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
1661
+ v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
1662
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1663
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
1664
+ }
1539
1665
 
1540
- const int sc = (sc_low | sc_high) - 32;
1666
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
+ }
1541
1668
 
1542
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1543
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1544
- const float d8i = bq8i->d;
1669
+ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1545
1670
 
1546
- const int vil = (vl >> (2*i)) & 0x03030303;
1671
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1672
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
1547
1673
 
1548
- const int vih = ((vh >> i) << 2) & 0x04040404;
1674
+ *x_ql = tile_x_qs;
1675
+ *x_dm = tile_x_dm;
1676
+ }
1549
1677
 
1550
- const int vi = __vsubss4(vil, vih);
1678
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
+
1682
+ __builtin_assume(i_offset >= 0);
1683
+ __builtin_assume(i_offset < 8);
1684
+ __builtin_assume(k >= 0);
1685
+ __builtin_assume(k < WARP_SIZE);
1686
+
1687
+ const int kbx = k / QI4_1;
1688
+ const int kqsx = k % QI4_1;
1689
+
1690
+ const block_q4_1 * bx0 = (block_q4_1 *) vx;
1551
1691
 
1552
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1692
+ #pragma unroll
1693
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1694
+ int i = i0 + i_offset;
1695
+
1696
+ if (need_check) {
1697
+ i = min(i, i_max);
1698
+ }
1699
+
1700
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
1701
+
1702
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
1553
1703
  }
1554
1704
 
1555
- return d*sumf;
1556
- #else
1557
- return 0.0f; // only to satisfy the compiler
1558
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1705
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
1706
+ const int kbxd = k % blocks_per_tile_x_row;
1707
+
1708
+ #pragma unroll
1709
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
1710
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
+
1712
+ if (need_check) {
1713
+ i = min(i, i_max);
1714
+ }
1715
+
1716
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1717
+
1718
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
1719
+ }
1559
1720
  }
1560
1721
 
1561
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1562
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1722
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1563
1725
 
1564
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1565
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1726
+ __builtin_assume(i >= 0);
1727
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
+ __builtin_assume(j >= 0);
1729
+ __builtin_assume(j < WARP_SIZE);
1730
+ __builtin_assume(k >= 0);
1731
+ __builtin_assume(k < WARP_SIZE);
1566
1732
 
1567
- float sumf_d = 0.0f;
1568
- float sumf_m = 0.0f;
1733
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1569
1734
 
1570
- #ifndef GGML_QKK_64
1735
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
1571
1736
 
1572
- // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573
- const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1737
+ #pragma unroll
1738
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
1741
+ }
1574
1742
 
1575
- const float d = bq4_K->d;
1576
- const float dmin = bq4_K->dmin;
1743
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
+ y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1746
+ }
1577
1747
 
1578
- // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
1579
- // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
1580
- // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
1581
- // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
1748
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1749
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1582
1750
 
1583
- const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1584
- const int v1 = q4[0];
1585
- const int v2 = q4[4];
1751
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1586
1752
 
1587
- const uint16_t * scales = (const uint16_t *)bq4_K->scales;
1588
- uint16_t aux[2];
1589
- const int j = bq8_offset/2;
1590
- if (j < 2) {
1591
- aux[0] = scales[j+0] & 0x3f3f;
1592
- aux[1] = scales[j+2] & 0x3f3f;
1593
- } else {
1594
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1595
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1753
+ int vl[VDR_Q5_0_Q8_1_MMVQ];
1754
+ int vh[VDR_Q5_0_Q8_1_MMVQ];
1755
+ int u[2*VDR_Q5_0_Q8_1_MMVQ];
1756
+
1757
+ #pragma unroll
1758
+ for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
1759
+ vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
1760
+ vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
1761
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1762
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
1596
1763
  }
1597
- const uint8_t * sc = (const uint8_t *)aux;
1598
- const uint8_t * m = sc + 2;
1599
1764
 
1600
- for (int i = 0; i < QR4_K; ++i) {
1765
+ return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
+ }
1767
+
1768
+ static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
+
1770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
1772
+
1773
+ *x_ql = tile_x_ql;
1774
+ *x_dm = (half2 *) tile_x_d;
1775
+ }
1776
+
1777
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
+
1781
+ __builtin_assume(i_offset >= 0);
1782
+ __builtin_assume(i_offset < 8);
1783
+ __builtin_assume(k >= 0);
1784
+ __builtin_assume(k < WARP_SIZE);
1785
+
1786
+ const int kbx = k / QI5_0;
1787
+ const int kqsx = k % QI5_0;
1788
+
1789
+ const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
+
1791
+ #pragma unroll
1792
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1793
+ int i = i0 + i_offset;
1794
+
1795
+ if (need_check) {
1796
+ i = min(i, i_max);
1797
+ }
1798
+
1799
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
1800
+
1801
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
1802
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
1803
+
1804
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1805
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1806
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1807
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1808
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1809
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
1810
+
1811
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1812
+
1813
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1814
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1815
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1816
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1817
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1818
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
1819
+
1820
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1821
+ }
1822
+
1823
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
1824
+ const int kbxd = k % blocks_per_tile_x_row;
1825
+ float * x_dmf = (float *) x_dm;
1826
+
1827
+ #pragma unroll
1828
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
1829
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
+
1831
+ if (need_check) {
1832
+ i = min(i, i_max);
1833
+ }
1834
+
1835
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1836
+
1837
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
1838
+ }
1839
+ }
1840
+
1841
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
+
1845
+ __builtin_assume(i >= 0);
1846
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
+ __builtin_assume(j >= 0);
1848
+ __builtin_assume(j < WARP_SIZE);
1849
+ __builtin_assume(k >= 0);
1850
+ __builtin_assume(k < WARP_SIZE);
1851
+
1852
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
+ const float * x_dmf = (float *) x_dm;
1855
+
1856
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
+
1858
+ #pragma unroll
1859
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
1862
+ }
1863
+
1864
+ return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1866
+ }
1867
+
1868
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1869
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1870
+
1871
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1872
+
1873
+ int vl[VDR_Q5_1_Q8_1_MMVQ];
1874
+ int vh[VDR_Q5_1_Q8_1_MMVQ];
1875
+ int u[2*VDR_Q5_1_Q8_1_MMVQ];
1876
+
1877
+ #pragma unroll
1878
+ for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
1879
+ vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
1880
+ vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
1881
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1882
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
1883
+ }
1884
+
1885
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
+ }
1887
+
1888
+ static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
+
1890
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
1892
+
1893
+ *x_ql = tile_x_ql;
1894
+ *x_dm = tile_x_dm;
1895
+ }
1896
+
1897
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
+
1901
+ __builtin_assume(i_offset >= 0);
1902
+ __builtin_assume(i_offset < 8);
1903
+ __builtin_assume(k >= 0);
1904
+ __builtin_assume(k < WARP_SIZE);
1905
+
1906
+ const int kbx = k / QI5_1;
1907
+ const int kqsx = k % QI5_1;
1908
+
1909
+ const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
+
1911
+ #pragma unroll
1912
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1913
+ int i = i0 + i_offset;
1914
+
1915
+ if (need_check) {
1916
+ i = min(i, i_max);
1917
+ }
1918
+
1919
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
1920
+
1921
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
1922
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
1923
+
1924
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
1925
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
1926
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
1927
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
1928
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
1929
+
1930
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
1931
+
1932
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
1933
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
1934
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
1935
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
1936
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
1937
+
1938
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
1939
+ }
1940
+
1941
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
1942
+ const int kbxd = k % blocks_per_tile_x_row;
1943
+
1944
+ #pragma unroll
1945
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
1946
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
+
1948
+ if (need_check) {
1949
+ i = min(i, i_max);
1950
+ }
1951
+
1952
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
1953
+
1954
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
1955
+ }
1956
+ }
1957
+
1958
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
+
1962
+ __builtin_assume(i >= 0);
1963
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
+ __builtin_assume(j >= 0);
1965
+ __builtin_assume(j < WARP_SIZE);
1966
+ __builtin_assume(k >= 0);
1967
+ __builtin_assume(k < WARP_SIZE);
1968
+
1969
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
+ const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
1971
+
1972
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
1973
+
1974
+ #pragma unroll
1975
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
+ u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
+ u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
1978
+ }
1979
+
1980
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1982
+ }
1983
+
1984
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1985
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1986
+
1987
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1988
+
1989
+ int v[VDR_Q8_0_Q8_1_MMVQ];
1990
+ int u[VDR_Q8_0_Q8_1_MMVQ];
1991
+
1992
+ for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
+ v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
+ u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
+ }
1996
+
1997
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
1998
+ }
1999
+
2000
+ static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
+
2002
+ __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
+ __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2004
+
2005
+ *x_ql = tile_x_qs;
2006
+ *x_dm = (half2 *) tile_x_d;
2007
+ }
2008
+
2009
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
+
2013
+ __builtin_assume(i_offset >= 0);
2014
+ __builtin_assume(i_offset < 8);
2015
+ __builtin_assume(k >= 0);
2016
+ __builtin_assume(k < WARP_SIZE);
2017
+
2018
+ const int kbx = k / QI8_0;
2019
+ const int kqsx = k % QI8_0;
2020
+ float * x_dmf = (float *) x_dm;
2021
+
2022
+ const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
+
2024
+ #pragma unroll
2025
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2026
+ int i = i0 + i_offset;
2027
+
2028
+ if (need_check) {
2029
+ i = min(i, i_max);
2030
+ }
2031
+
2032
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
+
2034
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
+ }
2037
+
2038
+ // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
+ // const int kbxd = k % blocks_per_tile_x_row;
2040
+
2041
+ // #pragma unroll
2042
+ // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
+ // FIXME out-of-bounds
2044
+ // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
+
2046
+ // #if GGML_CUDA_MMQ_Y < 64
2047
+ // if (i >= GGML_CUDA_MMQ_Y) {
2048
+ // return;
2049
+ // }
2050
+ // #endif // GGML_CUDA_MMQ_Y < 64
2051
+
2052
+ // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
+
2054
+ // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
+ // }
2056
+ }
2057
+
2058
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
+
2062
+ __builtin_assume(i >= 0);
2063
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
+ __builtin_assume(j >= 0);
2065
+ __builtin_assume(j < WARP_SIZE);
2066
+ __builtin_assume(k >= 0);
2067
+ __builtin_assume(k < WARP_SIZE);
2068
+
2069
+ const float * x_dmf = (float *) x_dm;
2070
+
2071
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
+ y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
+ }
2075
+
2076
+ #define VDR_q2_K_q8_1 1
2077
+
2078
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
+ const half2 & dm, const float * __restrict__ d8) {
2081
+
2082
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
+ float sumf_d = 0.0f;
2084
+ float sumf_m = 0.0f;
2085
+
2086
+ for (int i = 0; i < QR2_K; ++i) {
2087
+ const int sc = scales[2*i];
2088
+
2089
+ const int vi = (v >> (2*i)) & 0x03030303;
2090
+
2091
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
+
2093
+ int sc_high = sc >> 4;
2094
+ sc_high |= sc_high << 8;
2095
+ sc_high |= sc_high << 16;
2096
+ sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
+ }
2098
+
2099
+ const float2 dmf = __half22float2(dm);
2100
+
2101
+ return dmf.x*sumf_d - dmf.y*sumf_m;
2102
+ #else
2103
+ return 0.0f; // only to satisfy the compiler
2104
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2105
+ }
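The sc_high shifts above broadcast the 4-bit min scale into every byte lane, so a single __dp4a yields 'scale times the sum of four q8 values' and replaces the previous __dp4a(0x01010101, ui, 0) * (sc >> 4) formulation. A short illustration, not part of the diff:

    //   sc >> 4 == 3  ->  sc_high == 0x03030303
    //   __dp4a(0x03030303, u[i], 0) == 3 * (u0 + u1 + u2 + u3)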
2106
+
2107
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2108
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2109
+
2110
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
2111
+
2112
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
2113
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2114
+
2115
+ const uint8_t * scales = bq2_K->scales + scale_offset;
2116
+
2117
+ const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2118
+ int u[QR2_K];
2119
+ float d8[QR2_K];
2120
+
2121
+ for (int i = 0; i < QR2_K; ++ i) {
2122
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2123
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2124
+ }
2125
+
2126
+ return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
2127
+ }
2128
+
2129
+ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2130
+
2131
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2132
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
2133
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2134
+
2135
+ *x_ql = tile_x_ql;
2136
+ *x_dm = tile_x_dm;
2137
+ *x_sc = tile_x_sc;
2138
+ }
2139
+
2140
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2141
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2142
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2143
+
2144
+ __builtin_assume(i_offset >= 0);
2145
+ __builtin_assume(i_offset < 8);
2146
+ __builtin_assume(k >= 0);
2147
+ __builtin_assume(k < WARP_SIZE);
2148
+
2149
+ const int kbx = k / QI2_K;
2150
+ const int kqsx = k % QI2_K;
2151
+
2152
+ const block_q2_K * bx0 = (block_q2_K *) vx;
2153
+
2154
+ #pragma unroll
2155
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2156
+ int i = i0 + i_offset;
2157
+
2158
+ if (need_check) {
2159
+ i = min(i, i_max);
2160
+ }
2161
+
2162
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
2163
+
2164
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2165
+ }
2166
+
2167
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
2168
+ const int kbxd = k % blocks_per_tile_x_row;
2169
+
2170
+ #pragma unroll
2171
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
2172
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2173
+
2174
+ if (need_check) {
2175
+ i = min(i, i_max);
2176
+ }
2177
+
2178
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
2179
+
2180
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
2181
+ }
2182
+
2183
+ #pragma unroll
2184
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2185
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2186
+
2187
+ if (need_check) {
2188
+ i = min(i, i_max);
2189
+ }
2190
+
2191
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
2192
+
2193
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
2194
+ }
2195
+ }
2196
+
2197
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2198
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2199
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2200
+
2201
+ __builtin_assume(i >= 0);
2202
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2203
+ __builtin_assume(j >= 0);
2204
+ __builtin_assume(j < WARP_SIZE);
2205
+ __builtin_assume(k >= 0);
2206
+ __builtin_assume(k < WARP_SIZE);
2207
+
2208
+ const int kbx = k / QI2_K;
2209
+ const int kqsx = k % QI2_K;
2210
+
2211
+ const int bq8_offset = QR2_K * (kqsx / QI8_1);
2212
+ const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2213
+
2214
+ const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
2215
+
2216
+ int u[QR2_K];
2217
+ float d8[QR2_K];
2218
+
2219
+ for (int l = 0; l < QR2_K; ++ l) {
2220
+ const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2221
+ u[l] = y_qs[y_qs_index];
2222
+ d8[l] = y_ds[y_qs_index / QI8_1].x;
2223
+ }
2224
+
2225
+ return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
2226
+ }
2227
+
2228
+ #define VDR_q3_K_q8_1 1
2229
+
2230
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
2231
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2232
+ const int & scale_offset, const float & d, const float * __restrict__ d8) {
2233
+
2234
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2235
+ float sumf = 0.0f;
2236
+
2237
+ for (int i = 0; i < QR3_K; ++i) {
2238
+ const int isc = scale_offset + 2*i;
2239
+
2240
+ const int isc_low = isc % (QK_K/32);
2241
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
2242
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
2243
+
2244
+ const int isc_high = isc % (QK_K/64);
2245
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
2246
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2247
+
2248
+ const int sc = (sc_low | sc_high) - 32;
2249
+
2250
+ const int vil = (vl >> (2*i)) & 0x03030303;
2251
+
2252
+ const int vih = ((vh >> i) << 2) & 0x04040404;
2253
+
2254
+ const int vi = __vsubss4(vil, vih);
2255
+
2256
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2257
+ }
2258
+
2259
+ return d*sumf;
2260
+ #else
2261
+ return 0.0f; // only to satisfy the compiler
2262
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2263
+ }
2264
+
2265
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2266
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2267
+
2268
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
2269
+
2270
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
2271
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2272
+
2273
+ const float d = bq3_K->d;
2274
+
2275
+ const int vl = get_int_from_uint8(bq3_K->qs, iqs);
2276
+
2277
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
+ const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
+
2280
+ int u[QR3_K];
2281
+ float d8[QR3_K];
2282
+
2283
+ for (int i = 0; i < QR3_K; ++i) {
2284
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
+ }
2287
+
2288
+ return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
+ }
2290
+
2291
+ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
+
2293
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2297
+
2298
+ *x_ql = tile_x_ql;
2299
+ *x_dm = tile_x_dm;
2300
+ *x_qh = tile_x_qh;
2301
+ *x_sc = tile_x_sc;
2302
+ }
2303
+
2304
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
+
2308
+ __builtin_assume(i_offset >= 0);
2309
+ __builtin_assume(i_offset < 8);
2310
+ __builtin_assume(k >= 0);
2311
+ __builtin_assume(k < WARP_SIZE);
2312
+
2313
+ const int kbx = k / QI3_K;
2314
+ const int kqsx = k % QI3_K;
2315
+
2316
+ const block_q3_K * bx0 = (block_q3_K *) vx;
2317
+
2318
+ #pragma unroll
2319
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2320
+ int i = i0 + i_offset;
2321
+
2322
+ if (need_check) {
2323
+ i = min(i, i_max);
2324
+ }
2325
+
2326
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
2327
+
2328
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
2329
+ }
2330
+
2331
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
+ const int kbxd = k % blocks_per_tile_x_row;
2333
+
2334
+ #pragma unroll
2335
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2337
+
2338
+ if (need_check) {
2339
+ i = min(i, i_max);
2340
+ }
2341
+
2342
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
+
2344
+ x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2345
+ }
2346
+
2347
+ #pragma unroll
2348
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2349
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
+
2351
+ if (need_check) {
2352
+ i = min(i, i_max);
2353
+ }
2354
+
2355
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
+
2357
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
+ }
2359
+
2360
+ #pragma unroll
2361
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2362
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
+
2364
+ if (need_check) {
2365
+ i = min(i, i_max);
2366
+ }
2367
+
2368
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
+
2370
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2371
+ }
2372
+ }
2373
+
2374
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
+
2378
+ __builtin_assume(i >= 0);
2379
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
+ __builtin_assume(j >= 0);
2381
+ __builtin_assume(j < WARP_SIZE);
2382
+ __builtin_assume(k >= 0);
2383
+ __builtin_assume(k < WARP_SIZE);
2384
+
2385
+ const int kbx = k / QI3_K;
2386
+ const int kqsx = k % QI3_K;
2387
+
2388
+ const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
+ const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2390
+
2391
+ const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2392
+
2393
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
+ const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2395
+
2396
+ int u[QR3_K];
2397
+ float d8[QR3_K];
2398
+
2399
+ for (int l = 0; l < QR3_K; ++ l) {
2400
+ const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
+ u[l] = y_qs[y_qs_index];
2402
+ d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
+ }
2404
+
2405
+ return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
+ x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
+ }
2408
+
2409
+ #define VDR_q4_K_q8_1 2
2410
+
2411
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
+
2415
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
+ float sumf_d = 0.0f;
2417
+ float sumf_m = 0.0f;
2418
+
2419
+ for (int i = 0; i < QR4_K; ++i) {
2420
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2422
+
2423
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2425
+
2426
+ sumf_d += d8[i] * (dot1 * sc[i]);
2427
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2428
+ }
2429
+
2430
+ return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
+
2432
+ #else
2433
+ return 0.0f; // only to satisfy the compiler
2434
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2435
+ }
2436
+
2437
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2438
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2439
+
2440
+ #ifndef GGML_QKK_64
2441
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2442
+
2443
+ int v[2];
2444
+ int u[2*QR4_K];
2445
+ float d8[QR4_K];
2446
+
2447
+ // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
2448
+ const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
2449
+
2450
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
2451
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
2452
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
2453
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
2454
+
2455
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2456
+ v[0] = q4[0];
2457
+ v[1] = q4[4];
2458
+
2459
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
2460
+ uint16_t aux[2];
2461
+ const int j = bq8_offset/2;
2462
+ if (j < 2) {
2463
+ aux[0] = scales[j+0] & 0x3f3f;
2464
+ aux[1] = scales[j+2] & 0x3f3f;
2465
+ } else {
2466
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2467
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2468
+ }
2469
+ const uint8_t * sc = (const uint8_t *)aux;
2470
+ const uint8_t * m = sc + 2;
2471
+
2472
+ for (int i = 0; i < QR4_K; ++i) {
2473
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2474
+ d8[i] = bq8i->ds.x;
2475
+
2476
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2477
+ u[2*i+0] = q8[0];
2478
+ u[2*i+1] = q8[4];
2479
+ }
2480
+
2481
+ return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2482
+
2483
+ #else
2484
+
2485
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2486
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2487
+
2488
+ float sumf_d = 0.0f;
2489
+ float sumf_m = 0.0f;
2490
+
2491
+ uint16_t aux16[2];
2492
+ const uint8_t * s = (const uint8_t *)aux16;
2493
+
2494
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
2495
+ aux16[0] = a[0] & 0x0f0f;
2496
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2497
+
2498
+ const float dall = bq4_K->d[0];
2499
+ const float dmin = bq4_K->d[1];
2500
+
2501
+ const float d8_1 = bq8_1[0].ds.x;
2502
+ const float d8_2 = bq8_1[1].ds.x;
2503
+
2504
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2505
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2506
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2507
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2508
+
2509
+ const int * q4 = (const int *)bq4_K->qs + (iqs/2);
2510
+ const int v1 = q4[0];
2511
+ const int v2 = q4[4];
2512
+
2513
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
2514
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
2515
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2516
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2517
+
2518
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
2519
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2520
+
2521
+ return dall * sumf_d - dmin * sumf_m;
2522
+
2523
+ #else
2524
+ return 0.0f; // only to satisfy the compiler
2525
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2526
+
2527
+ #endif
2528
+ }
2529
+
2530
+ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
+
2532
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2535
+
2536
+ *x_ql = tile_x_ql;
2537
+ *x_dm = tile_x_dm;
2538
+ *x_sc = tile_x_sc;
2539
+ }
2540
+
2541
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
+
2545
+ __builtin_assume(i_offset >= 0);
2546
+ __builtin_assume(i_offset < 8);
2547
+ __builtin_assume(k >= 0);
2548
+ __builtin_assume(k < WARP_SIZE);
2549
+
2550
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
2551
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
2552
+
2553
+ const block_q4_K * bx0 = (block_q4_K *) vx;
2554
+
2555
+ #pragma unroll
2556
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2557
+ int i = i0 + i_offset;
2558
+
2559
+ if (need_check) {
2560
+ i = min(i, i_max);
2561
+ }
2562
+
2563
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2564
+
2565
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2566
+ }
2567
+
2568
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
+
2571
+ #pragma unroll
2572
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2574
+
2575
+ if (need_check) {
2576
+ i = min(i, i_max);
2577
+ }
2578
+
2579
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2580
+
2581
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2582
+ }
2583
+
2584
+ #pragma unroll
2585
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2587
+
2588
+ if (need_check) {
2589
+ i = min(i, i_max);
2590
+ }
2591
+
2592
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
+
2594
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
+ }
2596
+ }
2597
+
2598
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
+
2602
+ __builtin_assume(i >= 0);
2603
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
+ __builtin_assume(j >= 0);
2605
+ __builtin_assume(j < WARP_SIZE);
2606
+ __builtin_assume(k >= 0);
2607
+ __builtin_assume(k < WARP_SIZE);
2608
+
2609
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
+
2612
+ int v[2];
2613
+ int u[2*QR4_K];
2614
+ float d8[QR4_K];
2615
+
2616
+ // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
+ const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2618
+
2619
+ v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
+ v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2621
+
2622
+ const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
+ uint16_t aux[2];
2624
+ const int l = bq8_offset/2;
2625
+ if (l < 2) {
2626
+ aux[0] = scales[l+0] & 0x3f3f;
2627
+ aux[1] = scales[l+2] & 0x3f3f;
2628
+ } else {
2629
+ aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
+ aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2631
+ }
2632
+ const uint8_t * sc = (const uint8_t *)aux;
2633
+ const uint8_t * m = sc + 2;
2634
+
2635
+ for (int l = 0; l < QR4_K; ++l) {
2636
+ const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
+ u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
+ u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
+ d8[l] = y_ds[kqsy / QI8_1].x;
2640
+ }
2641
+
2642
+ return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
+ }
2644
+
2645
+ #define VDR_q5_K_q8_1 2
2646
+
2647
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2650
+
2651
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
+ float sumf_d = 0.0f;
2653
+ float sumf_m = 0.0f;
2654
+
2655
+ for (int i = 0; i < QR5_K; ++i) {
2656
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2658
+
2659
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2661
+
2662
+ const int v0i = vl0i | vh0i;
2663
+ const int v1i = vl1i | vh1i;
2664
+
2665
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2667
+
2668
+ sumf_d += d8[i] * (dot1 * sc[i]);
2669
+ sumf_m += d8[i] * (dot2 * m[i]);
2670
+
2671
+ }
2672
+
2673
+ return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2674
+
2675
+ #else
2676
+ return 0.0f; // only to satisfy the compiler
2677
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2678
+ }
2679
+
2680
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2681
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2682
+
2683
+ #ifndef GGML_QKK_64
2684
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2685
+
2686
+ int vl[2];
2687
+ int vh[2];
2688
+ int u[2*QR5_K];
2689
+ float d8[QR5_K];
2690
+
2691
+ const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
2692
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2693
+ const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
2694
+
2695
+ vl[0] = ql[0];
2696
+ vl[1] = ql[4];
2697
+
2698
+ vh[0] = qh[0] >> bq8_offset;
2699
+ vh[1] = qh[4] >> bq8_offset;
2700
+
2701
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
2702
+ uint16_t aux[2];
2703
+ const int j = bq8_offset/2;
2704
+ if (j < 2) {
2705
+ aux[0] = scales[j+0] & 0x3f3f;
2706
+ aux[1] = scales[j+2] & 0x3f3f;
2707
+ } else {
2708
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2709
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2710
+ }
2711
+ const uint8_t * sc = (const uint8_t *)aux;
2712
+ const uint8_t * m = sc + 2;
2713
+
2714
+ for (int i = 0; i < QR5_K; ++i) {
2715
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
+ d8[i] = bq8i->ds.x;
2717
+
2718
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2719
+ u[2*i+0] = q8[0];
2720
+ u[2*i+1] = q8[4];
2721
+ }
2722
+
2723
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2724
+
2725
+ #else
2726
+
2727
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2728
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2729
+
2730
+ const int8_t * s = bq5_K->scales;
2731
+
2732
+ const float d = bq5_K->d;
2733
+
2734
+ const float d8_1 = bq8_1[0].ds.x;
2735
+ const float d8_2 = bq8_1[1].ds.x;
2736
+
2737
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2738
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2739
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2740
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2741
+
2742
+ const int * ql = (const int *)bq5_K->qs + (iqs/2);
2743
+ const int vl1 = ql[0];
2744
+ const int vl2 = ql[4];
2745
+
2746
+ const int step = 4 * (iqs/2); // 0, 4, 8, 12
2747
+ const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
2748
+ const int in = step%8; // 0, 4, 0, 4
2749
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
2750
+
2751
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
2752
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
2753
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
2754
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
2755
+
2756
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
2757
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
2758
+
2759
+ return d * sumf_d;
2760
+
2761
+ #else
2762
+ return 0.0f; // only to satisfy the compiler
2763
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2764
+
2765
+ #endif
2766
+ }
2767
+
2768
+ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
+
2770
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2774
+
2775
+ *x_ql = tile_x_ql;
2776
+ *x_dm = tile_x_dm;
2777
+ *x_qh = tile_x_qh;
2778
+ *x_sc = tile_x_sc;
2779
+ }
2780
+
2781
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
+
2785
+ __builtin_assume(i_offset >= 0);
2786
+ __builtin_assume(i_offset < 8);
2787
+ __builtin_assume(k >= 0);
2788
+ __builtin_assume(k < WARP_SIZE);
2789
+
2790
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
2791
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
2792
+
2793
+ const block_q5_K * bx0 = (block_q5_K *) vx;
2794
+
2795
+ #pragma unroll
2796
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2797
+ int i = i0 + i_offset;
2798
+
2799
+ if (need_check) {
2800
+ i = min(i, i_max);
2801
+ }
2802
+
2803
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2804
+
2805
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2806
+ }
2807
+
2808
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
+
2811
+ #pragma unroll
2812
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2814
+
2815
+ if (need_check) {
2816
+ i = min(i, i_max);
2817
+ }
2818
+
2819
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
2820
+
2821
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
2822
+ }
2823
+
2824
+ #pragma unroll
2825
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2827
+
2828
+ if (need_check) {
2829
+ i = min(i, i_max);
2830
+ }
2831
+
2832
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
+
2834
+ x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
+ }
2836
+
2837
+ #pragma unroll
2838
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2840
+
2841
+ if (need_check) {
2842
+ i = min(i, i_max);
2843
+ }
2844
+
2845
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2846
+
2847
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2848
+ }
2849
+ }
2850
+
2851
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
+
2855
+ __builtin_assume(i >= 0);
2856
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
+ __builtin_assume(j >= 0);
2858
+ __builtin_assume(j < WARP_SIZE);
2859
+ __builtin_assume(k >= 0);
2860
+ __builtin_assume(k < WARP_SIZE);
2861
+
2862
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
+
2865
+ int vl[2];
2866
+ int vh[2];
2867
+ int u[2*QR4_K];
2868
+ float d8[QR4_K];
2869
+
2870
+ const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
+
2872
+ vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
+ vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
+
2875
+ vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
+ vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
+
2878
+ const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
+ uint16_t aux[2];
2880
+ const int l = bq8_offset/2;
2881
+ if (l < 2) {
2882
+ aux[0] = scales[l+0] & 0x3f3f;
2883
+ aux[1] = scales[l+2] & 0x3f3f;
2884
+ } else {
2885
+ aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
+ aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
+ }
2888
+ const uint8_t * sc = (const uint8_t *)aux;
2889
+ const uint8_t * m = sc + 2;
2890
+
2891
+ for (int l = 0; l < QR5_K; ++l) {
2892
+ const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
+ u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
+ u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
+ d8[l] = y_ds[kqsy / QI8_1].x;
2896
+ }
2897
+
2898
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
+ }
2900
+
2901
+ #define VDR_q6_K_q8_1 1
2902
+
2903
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
+ const float & d, const float * __restrict__ d8) {
2906
+
2907
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
+ float sumf = 0.0f;
2909
+
2910
+ for (int i = 0; i < QR6_K; ++i) {
2911
+ const int sc = scales[4*i];
2912
+
2913
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
2914
+
2915
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
2916
+
2917
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
2918
+
2919
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2920
+ }
2921
+
2922
+ return d*sumf;
2923
+ #else
2924
+ return 0.0f; // only to satisfy the compiler
2925
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2926
+ }
2927
+
2928
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2929
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2930
+
2931
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
2932
+
2933
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
2934
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
2935
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1601
2936
 
1602
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1603
- const float d8i = bq8i->d;
1604
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1605
- const int ui1 = q8[0];
1606
- const int ui2 = q8[4];
2937
+ const int vl = get_int_from_uint8(bq6_K->ql, iqs);
2938
+ const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
1607
2939
 
1608
- const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
1609
- const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
2940
+ const int8_t * scales = bq6_K->scales + scale_offset;
1610
2941
 
1611
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1612
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2942
+ int u[QR6_K];
2943
+ float d8[QR6_K];
1613
2944
 
1614
- sumf_d += d8i * (dot1 * sc[i]);
1615
- sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2945
+ for (int i = 0; i < QR6_K; ++i) {
2946
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
+ d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
1616
2948
  }
1617
2949
 
1618
- return d*sumf_d - dmin*sumf_m;
2950
+ return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
2951
+ }
1619
2952
 
1620
- #else
2953
+ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1621
2954
 
1622
- uint16_t aux16[2];
1623
- const uint8_t * s = (const uint8_t *)aux16;
2955
+ __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
+ __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
+ __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
+ __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
1624
2959
 
1625
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626
- aux16[0] = a[0] & 0x0f0f;
1627
- aux16[1] = (a[0] >> 4) & 0x0f0f;
2960
+ *x_ql = tile_x_ql;
2961
+ *x_dm = tile_x_dm;
2962
+ *x_qh = tile_x_qh;
2963
+ *x_sc = tile_x_sc;
2964
+ }
1628
2965
 
1629
- const float dall = bq4_K->d[0];
1630
- const float dmin = bq4_K->d[1];
2966
+ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1631
2969
 
1632
- const float d8_1 = bq8_1[0].d;
1633
- const float d8_2 = bq8_1[1].d;
2970
+ __builtin_assume(i_offset >= 0);
2971
+ __builtin_assume(i_offset < 8);
2972
+ __builtin_assume(k >= 0);
2973
+ __builtin_assume(k < WARP_SIZE);
1634
2974
 
1635
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
2975
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
2976
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1639
2977
 
1640
- const int * q4 = (const int *)bq4_K->qs + iqs;
1641
- const int v1 = q4[0];
1642
- const int v2 = q4[4];
2978
+ const block_q6_K * bx0 = (block_q6_K *) vx;
1643
2979
 
1644
- const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645
- const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646
- const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647
- const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2980
+ #pragma unroll
2981
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2982
+ int i = i0 + i_offset;
1648
2983
 
1649
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2984
+ if (need_check) {
2985
+ i = min(i, i_max);
2986
+ }
1651
2987
 
1652
- return dall * sumf_d - dmin * sumf_m;
2988
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1653
2989
 
1654
- #endif
2990
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
+ }
1655
2992
 
1656
- #else
1657
- return 0.0f; // only to satisfy the compiler
1658
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1659
- }
2993
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1660
2995
 
1661
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1662
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
2996
+ #pragma unroll
2997
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
1663
2999
 
1664
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1665
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
3000
+ if (need_check) {
3001
+ i = min(i, i_max);
3002
+ }
1666
3003
 
1667
- #ifndef GGML_QKK_64
3004
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1668
3005
 
1669
- const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
1670
- const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1671
- const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
3006
+ x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3007
+ }
1672
3008
 
1673
- float sumf_d = 0.0f;
1674
- float sumf_m = 0.0f;
3009
+ #pragma unroll
3010
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
1675
3012
 
1676
- const float d = bq5_K->d;
1677
- const float dmin = bq5_K->dmin;
3013
+ if (need_check) {
3014
+ i = min(i, i_max);
3015
+ }
1678
3016
 
1679
- const int vl1 = ql[0];
1680
- const int vl2 = ql[4];
3017
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
1681
3018
 
1682
- const int vh1 = qh[0] >> bq8_offset;
1683
- const int vh2 = qh[4] >> bq8_offset;
3019
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3020
+ }
1684
3021
 
1685
- const uint16_t * scales = (const uint16_t *)bq5_K->scales;
1686
- uint16_t aux[2];
1687
- const int j = bq8_offset/2;
1688
- if (j < 2) {
1689
- aux[0] = scales[j+0] & 0x3f3f;
1690
- aux[1] = scales[j+2] & 0x3f3f;
1691
- } else {
1692
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1693
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
3022
+ #pragma unroll
3023
+ for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3025
+
3026
+ if (need_check) {
3027
+ i = min(i, i_max);
3028
+ }
3029
+
3030
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
3031
+
3032
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
1694
3033
  }
1695
- const uint8_t * sc = (const uint8_t *)aux;
1696
- const uint8_t * m = sc + 2;
3034
+ }
1697
3035
 
1698
- for (int i = 0; i < QR5_K; ++i) {
3036
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1699
3039
 
1700
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1701
- const float d8i = bq8i->d;
1702
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1703
- const int ui1 = q8[0];
1704
- const int ui2 = q8[4];
3040
+ __builtin_assume(i >= 0);
3041
+ __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
+ __builtin_assume(j >= 0);
3043
+ __builtin_assume(j < WARP_SIZE);
3044
+ __builtin_assume(k >= 0);
3045
+ __builtin_assume(k < WARP_SIZE);
1705
3046
 
1706
- const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
1707
- const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
3047
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
1708
3049
 
1709
- const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
1710
- const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
3050
+ const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
+ const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
+ const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
1711
3053
 
1712
- const int vi1 = vil1 | vih1;
1713
- const int vi2 = vil2 | vih2;
3054
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
1714
3055
 
1715
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1716
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
3056
+ const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
+ const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
1717
3058
 
1718
- sumf_d += d8i * (dot1 * sc[i]);
1719
- sumf_m += d8i * (dot2 * m[i]);
3059
+ int u[QR6_K];
3060
+ float d8[QR6_K];
1720
3061
 
3062
+ for (int l = 0; l < QR6_K; ++l) {
3063
+ const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
+ u[l] = y_qs[kqsy];
3065
+ d8[l] = y_ds[kqsy / QI8_1].x;
1721
3066
  }
1722
3067
 
1723
- return d*sumf_d - dmin*sumf_m;
1724
-
1725
- #else
3068
+ return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
+ x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3070
+ }
1726
3071
 
1727
- const int8_t * s = bq5_K->scales;
3072
+ template <int qk, int qr, int qi, typename block_q_t,
3073
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
+ static __global__ void mul_mat_q(
3075
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3076
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1728
3077
 
1729
- const float d = bq5_K->d;
3078
+ const block_q_t * x = (const block_q_t *) vx;
3079
+ const block_q8_1 * y = (const block_q8_1 *) vy;
1730
3080
 
1731
- const float d8_1 = bq8_1[0].d;
1732
- const float d8_2 = bq8_1[1].d;
3081
+ const int blocks_per_row_x = ncols_x / qk;
3082
+ const int blocks_per_col_y = nrows_y / QK8_1;
3083
+ const int blocks_per_warp = WARP_SIZE / qi;
1733
3084
 
1734
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
3085
+ const int & ncols_dst = ncols_y;
1738
3086
 
1739
- const int * ql = (const int *)bq5_K->qs + iqs;
1740
- const int vl1 = ql[0];
1741
- const int vl2 = ql[4];
3087
+ const int tid_x = threadIdx.x;
3088
+ const int tid_y = threadIdx.y;
1742
3089
 
1743
- const int step = 4 * iqs; // 0, 4, 8, 12
1744
- const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745
- const int in = step%8; // 0, 4, 0, 4
1746
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
3090
+ const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3091
+ const int & row_x_0 = row_dst_0;
3092
+ const int row_dst = row_dst_0 + tid_x;
1747
3093
 
1748
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
3094
+ const int col_dst_0 = blockIdx.y*WARP_SIZE;
3095
+ const int & col_y_0 = col_dst_0;
1752
3096
 
1753
- const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754
- + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
3097
+ int * tile_x_ql = nullptr;
3098
+ half2 * tile_x_dm = nullptr;
3099
+ int * tile_x_qh = nullptr;
3100
+ int * tile_x_sc = nullptr;
1755
3101
 
1756
- return d * sumf_d;
3102
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
1757
3103
 
1758
- #endif
3104
+ const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
1759
3105
 
1760
- #else
1761
- return 0.0f; // only to satisfy the compiler
1762
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1763
- }
3106
+ __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
+ __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
1764
3108
 
1765
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1766
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
3109
+ float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
1767
3110
 
1768
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1769
- const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3111
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1770
3112
 
1771
- const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1772
- const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1773
- const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3113
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
+ tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
1774
3115
 
1775
- float sumf = 0.0f;
3116
+ for (int ir = 0; ir < qr; ++ir) {
3117
+ const int kqs = ir*WARP_SIZE + tid_x;
3118
+ const int kbxd = kqs / QI8_1;
1776
3119
 
1777
- const float d = bq6_K->d;
3120
+ for (int i = 0; i < WARP_SIZE; i += 8) {
3121
+ const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1778
3122
 
1779
- int vl;
1780
- memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
3123
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1781
3124
 
1782
- int vh;
1783
- memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
3125
+ tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3126
+ }
3127
+ }
1784
3128
 
1785
- for (int i = 0; i < QR6_K; ++i) {
1786
- const int sc = bq6_K->scales[scale_offset + 4*i];
3129
+ for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
+ const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
+ const int kby = tid_x % blocks_per_tile_y_col;
3132
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
+ tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
+ }
1787
3135
 
1788
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1789
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1790
- const float d8i = bq8i->d;
3136
+ __syncthreads();
1791
3137
 
1792
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3138
+ #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
+ #pragma unroll
3140
+ #endif // __CUDA_ARCH__ >= 700
3141
+ for (int k = 0; k < WARP_SIZE; k += vdr) {
3142
+ #pragma unroll
3143
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3144
+ #pragma unroll
3145
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
+ sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
+ tid_x + i, tid_y + j, k);
3148
+ }
3149
+ }
3150
+ }
1793
3151
 
1794
- const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
3152
+ __syncthreads();
3153
+ }
1795
3154
 
1796
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1797
3155
 
1798
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3156
+ if (row_dst >= nrows_dst) {
3157
+ return;
1799
3158
  }
1800
3159
 
1801
- return d*sumf;
1802
- #else
1803
- return 0.0f; // only to satisfy the compiler
1804
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3160
+ for (int j = 0; j < WARP_SIZE; j += 8) {
3161
+ const int col_dst = col_dst_0 + j + tid_y;
3162
+
3163
+ if (col_dst >= ncols_dst) {
3164
+ return;
3165
+ }
3166
+
3167
+ for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3169
+ }
3170
+ }
1805
3171
  }
1806
3172
 
1807
- template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
3173
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
1808
3174
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1809
3175
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1810
3176
 
@@ -1813,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1813
3179
  }
1814
3180
 
1815
3181
  const int blocks_per_row = ncols / qk;
1816
- const int blocks_per_warp = WARP_SIZE / qi;
3182
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
1817
3183
 
1818
3184
  // partial sum for each thread
1819
3185
  float tmp = 0.0f;
@@ -1822,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1822
3188
  const block_q8_1 * y = (const block_q8_1 *) vy;
1823
3189
 
1824
3190
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1825
- const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
3191
+ const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
1826
3192
 
1827
- const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
3193
+ const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
1828
3194
 
1829
- const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
3195
+ const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
1830
3196
 
1831
3197
  tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1832
3198
  }
@@ -1859,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1859
3225
  const int y_offset = qr == 1 ? 1 : qk/2;
1860
3226
 
1861
3227
  // partial sum for each thread
1862
- #ifdef GGML_CUDA_DMMV_F16
3228
+ #ifdef GGML_CUDA_F16
1863
3229
  half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
1864
3230
  #else
1865
3231
  float tmp = 0.0f;
1866
- #endif // GGML_CUDA_DMMV_F16
3232
+ #endif // GGML_CUDA_F16
1867
3233
 
1868
3234
  for (int i = 0; i < ncols; i += iter_stride) {
1869
3235
  const int col = i + vals_per_iter*tid;
@@ -1883,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1883
3249
 
1884
3250
  // matrix multiplication
1885
3251
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
1886
- #ifdef GGML_CUDA_DMMV_F16
3252
+ #ifdef GGML_CUDA_F16
1887
3253
  tmp += __hmul2(v, {
1888
3254
  y[iybs + iqs + j/qr + 0],
1889
3255
  y[iybs + iqs + j/qr + y_offset]
@@ -1891,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1891
3257
  #else
1892
3258
  tmp += v.x * y[iybs + iqs + j/qr + 0];
1893
3259
  tmp += v.y * y[iybs + iqs + j/qr + y_offset];
1894
- #endif // GGML_CUDA_DMMV_F16
3260
+ #endif // GGML_CUDA_F16
1895
3261
  }
1896
3262
  }
1897
3263
 
@@ -1902,11 +3268,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1902
3268
  }
1903
3269
 
1904
3270
  if (tid == 0) {
1905
- #ifdef GGML_CUDA_DMMV_F16
3271
+ #ifdef GGML_CUDA_F16
1906
3272
  dst[row] = tmp.x + tmp.y;
1907
3273
  #else
1908
3274
  dst[row] = tmp;
1909
- #endif // GGML_CUDA_DMMV_F16
3275
+ #endif // GGML_CUDA_F16
1910
3276
  }
1911
3277
  }
1912
3278
 
@@ -2046,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
2046
3412
  }
2047
3413
 
2048
3414
  // rope == RoPE == rotary positional embedding
2049
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
3415
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3416
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
2050
3417
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
2051
3418
 
2052
3419
  if (col >= ncols) {
@@ -2056,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
2056
3423
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
2057
3424
  const int i = row*ncols + col;
2058
3425
 
2059
- const float theta = p*powf(theta_scale, col/2);
3426
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
2060
3427
  const float sin_theta = sinf(theta);
2061
3428
  const float cos_theta = cosf(theta);
2062
3429
 
@@ -2203,9 +3570,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
2203
3570
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
2204
3571
  }
2205
3572
 
2206
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
2207
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
2208
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
3573
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
3574
+ const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
3575
+ const dim3 num_blocks(block_num_x, ky, 1);
3576
+ const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
3577
+ quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
2209
3578
  }
2210
3579
 
2211
3580
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2366,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
2366
3735
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2367
3736
  const dim3 block_nums(1, block_num_y, 1);
2368
3737
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2369
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
3738
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
2370
3739
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2371
3740
  }
2372
3741
 
@@ -2375,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
2375
3744
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2376
3745
  const dim3 block_nums(1, block_num_y, 1);
2377
3746
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2378
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
3747
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
2379
3748
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2380
3749
  }
2381
3750
 
@@ -2384,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
2384
3753
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2385
3754
  const dim3 block_nums(1, block_num_y, 1);
2386
3755
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2387
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
3756
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
2388
3757
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2389
3758
  }
2390
3759
 
@@ -2393,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
2393
3762
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2394
3763
  const dim3 block_nums(1, block_num_y, 1);
2395
3764
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2396
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
3765
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
2397
3766
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2398
3767
  }
2399
3768
 
@@ -2402,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
2402
3771
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2403
3772
  const dim3 block_nums(1, block_num_y, 1);
2404
3773
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2405
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
3774
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
2406
3775
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2407
3776
  }
2408
3777
 
@@ -2411,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
2411
3780
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2412
3781
  const dim3 block_nums(1, block_num_y, 1);
2413
3782
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2414
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
3783
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
2415
3784
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2416
3785
  }
2417
3786
 
@@ -2420,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
2420
3789
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2421
3790
  const dim3 block_nums(1, block_num_y, 1);
2422
3791
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2423
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
3792
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
2424
3793
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2425
3794
  }
2426
3795
 
@@ -2429,10 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2429
3798
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2430
3799
  const dim3 block_nums(1, block_num_y, 1);
2431
3800
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2432
- // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
2433
- // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2434
- // is better amortized.
2435
- mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
3801
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
2436
3802
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2437
3803
  }
2438
3804
 
@@ -2441,10 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
2441
3807
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2442
3808
  const dim3 block_nums(1, block_num_y, 1);
2443
3809
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2444
- // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
2445
- // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2446
- // is better amortized.
2447
- mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
3810
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
2448
3811
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2449
3812
  }
2450
3813
 
@@ -2453,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
2453
3816
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2454
3817
  const dim3 block_nums(1, block_num_y, 1);
2455
3818
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2456
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
3819
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
2457
3820
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2458
3821
  }
2459
3822
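Note: each mul_mat_vec_q instantiation above now passes an explicit per-iteration constant (VDR_*_MMVQ for the classic quants, VDR_q*_K_q8_1 for the k-quants) between the block type and the dot-product function, and the old QI4_K/2 / QI5_K/2 special cases are dropped in favour of that parameter. The launch geometry itself is unchanged; a minimal standalone sketch of it, with illustrative values for the build-time constants, is:

    // Sketch of the launch geometry computed by the mul_mat_vec_q*_q8_1_cuda wrappers.
    // GGML_CUDA_MMV_Y is a build-time define; the value here is an assumption for illustration.
    #include <cstdio>

    int main() {
        const int GGML_CUDA_MMV_Y = 1;   // assumed default
        const int WARP_SIZE       = 32;
        const int nrows           = 4096;

        const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;  // ceiling division
        printf("grid = (1, %d, 1), block = (%d, %d, 1)\n", block_num_y, WARP_SIZE, GGML_CUDA_MMV_Y);
        return 0;
    }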
 
@@ -2500,6 +3863,186 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2500
3863
  }
2501
3864
  }
2502
3865
 
3866
+ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
+
3870
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3872
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3873
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3874
+
3875
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3878
+ } else {
3879
+ mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3881
+ }
3882
+ }
3883
+
3884
+ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
+
3888
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3890
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3891
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3892
+
3893
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3896
+ } else {
3897
+ mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3899
+ }
3900
+ }
3901
+
3902
+ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
+
3906
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3908
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3909
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3910
+
3911
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3914
+ } else {
3915
+ mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3917
+ }
3918
+ }
3919
+
3920
+ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
+
3924
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3926
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3927
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3928
+
3929
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3932
+ } else {
3933
+ mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3935
+ }
3936
+ }
3937
+
3938
+ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
+
3942
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3944
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3945
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3946
+
3947
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3950
+ } else {
3951
+ mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3953
+ }
3954
+ }
3955
+
3956
+ static void ggml_mul_mat_q2_K_q8_1_cuda(
3957
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3958
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3959
+
3960
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3961
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3962
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3963
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3964
+
3965
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3966
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3967
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
+ } else {
3969
+ mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3970
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3971
+ }
3972
+ }
3973
+
3974
+ static void ggml_mul_mat_q3_K_q8_1_cuda(
3975
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3976
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3977
+
3978
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3979
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3980
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3981
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3982
+
3983
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3984
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ } else {
3987
+ mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3988
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3989
+ }
3990
+ }
3991
+
3992
+ static void ggml_mul_mat_q4_K_q8_1_cuda(
3993
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3994
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3995
+
3996
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3997
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3998
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3999
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4000
+
4001
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4002
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4003
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4004
+ } else {
4005
+ mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4006
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4007
+ }
4008
+ }
4009
+
4010
+ static void ggml_mul_mat_q5_K_q8_1_cuda(
4011
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4012
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4013
+
4014
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4015
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4016
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4017
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4018
+
4019
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4020
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4021
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4022
+ } else {
4023
+ mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4024
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4025
+ }
4026
+ }
4027
+
4028
+ static void ggml_mul_mat_q6_K_q8_1_cuda(
4029
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4030
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4031
+
4032
+ const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4033
+ const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4034
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4035
+ const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4036
+
4037
+ if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4038
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4039
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4040
+ } else {
4041
+ mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4042
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4043
+ }
4044
+ }
4045
+
2503
4046
  static void ggml_mul_mat_p021_f16_f32_cuda(
2504
4047
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
2505
4048
  const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
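Note: all of the new ggml_mul_mat_q*_q8_1_cuda wrappers above share one dispatch: when nrows_x is a multiple of GGML_CUDA_MMQ_Y the load_tiles_*<false> instantiation is launched, otherwise load_tiles_*<true>. Reading that boolean as a bounds-check toggle is an assumption; the diff itself only shows the two instantiations. A standalone sketch of the pattern:

    // Dispatch sketch; launch_stub stands in for the real templated kernel launch.
    #include <cstdio>

    template <bool need_check>
    static void launch_stub(int nrows_x) {
        printf("nrows_x = %d, need_check = %s\n", nrows_x, need_check ? "true" : "false");
    }

    static void dispatch(int nrows_x, int mmq_y /* GGML_CUDA_MMQ_Y */) {
        if (nrows_x % mmq_y == 0) {
            launch_stub<false>(nrows_x);   // every tile of mmq_y rows fits entirely in the matrix
        } else {
            launch_stub<true>(nrows_x);    // the last tile is partial, so loads presumably need guarding
        }
    }

    int main() {
        dispatch(128, 64);   // multiple of the assumed tile height
        dispatch(100, 64);   // not a multiple
        return 0;
    }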
@@ -2544,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
2544
4087
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
2545
4088
  }
2546
4089
 
2547
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
4090
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4091
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
2548
4092
  GGML_ASSERT(nrows % 2 == 0);
2549
4093
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2550
4094
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
2551
4095
  const dim3 block_nums(num_blocks_x, nrows, 1);
2552
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
4096
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
2553
4097
  }
2554
4098
 
2555
4099
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
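Note: rope_f32_cuda now forwards a starting position p0, a per-slice increment p_delta and a slice height p_delta_rows instead of a single precomputed position. The kernel body is not part of this hunk; the sketch below shows one formula consistent with the call site later in this diff (p0 = (mode & 1 ? 0 : n_past) * freq_scale, p_delta = freq_scale, p_delta_rows = ne01) and should be read as an assumption, not the actual kernel:

    // Assumed per-row position derivation: one position step per p_delta_rows rows.
    #include <cstdio>

    int main() {
        const float freq_scale = 1.0f;
        const int   n_past = 7;
        const int   ne01   = 32;                 // rows per slice sharing one position step

        const float p0      = n_past * freq_scale;
        const float p_delta = freq_scale;

        for (int row = 0; row < 3*ne01; row += ne01) {
            const float p = p0 + p_delta * (row / ne01);   // assumption based on the new parameter names
            printf("row %3d -> p = %.1f\n", row, p);
        }
        return 0;
    }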
@@ -2676,10 +4220,9 @@ static size_t g_scratch_offset = 0;
2676
4220
 
2677
4221
  static int g_device_count = -1;
2678
4222
  static int g_main_device = 0;
2679
- #ifndef GGML_CUDA_FORCE_DMMV
2680
4223
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2681
- #endif
2682
4224
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4225
+ static bool g_mul_mat_q = false;
2683
4226
 
2684
4227
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
2685
4228
 
@@ -2701,9 +4244,7 @@ void ggml_init_cublas() {
2701
4244
  g_tensor_split[id] = total_vram;
2702
4245
  total_vram += prop.totalGlobalMem;
2703
4246
 
2704
- #ifndef GGML_CUDA_FORCE_DMMV
2705
4247
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
2706
- #endif
2707
4248
  }
2708
4249
  for (int id = 0; id < g_device_count; ++id) {
2709
4250
  g_tensor_split[id] /= total_vram;
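Note: the compute capability of every device is now recorded unconditionally (the GGML_CUDA_FORCE_DMMV guard is gone) using the 100*major + 10*minor encoding, and it later gates the new quantized kernels against MIN_CC_DP4A. The device and threshold values below are illustrative assumptions:

    // Illustration of the g_compute_capabilities encoding.
    #include <cstdio>

    int main() {
        const int major = 6, minor = 1;          // assumed example device
        const int cc = 100*major + 10*minor;     // -> 610
        const int MIN_CC_DP4A = 610;             // assumed threshold; its definition is not in this hunk
        printf("cc = %d, mul_mat_q eligible: %s\n", cc, cc >= MIN_CC_DP4A ? "yes" : "no");
        return 0;
    }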
@@ -2965,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
2965
4506
  (void) i1;
2966
4507
  }
2967
4508
 
4509
+ inline void ggml_cuda_op_mul_mat_q(
4510
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4511
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
4512
+ cudaStream_t & cudaStream_main){
4513
+
4514
+ GGML_ASSERT(src0_ddq_i != nullptr);
4515
+ GGML_ASSERT(src1_ddf_i != nullptr);
4516
+ GGML_ASSERT(dst_ddf_i != nullptr);
4517
+
4518
+ const int64_t ne00 = src0->ne[0];
4519
+
4520
+ const int64_t ne10 = src1->ne[0];
4521
+ const int64_t ne11 = src1->ne[1];
4522
+ GGML_ASSERT(ne10 % QK8_1 == 0);
4523
+
4524
+ const int64_t ne0 = dst->ne[0];
4525
+
4526
+ const int64_t i01_diff = i01_high - i01_low;
4527
+
4528
+ int id;
4529
+ CUDA_CHECK(cudaGetDevice(&id));
4530
+
4531
+ // the main device has a larger memory buffer to hold the results from all GPUs
4532
+ // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
4533
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
4534
+
4535
+ const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
4536
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
4537
+ size_t as;
4538
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
4539
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
4540
+
4541
+ switch (src0->type) {
4542
+ case GGML_TYPE_Q4_0:
4543
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4544
+ break;
4545
+ case GGML_TYPE_Q4_1:
4546
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4547
+ break;
4548
+ case GGML_TYPE_Q5_0:
4549
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4550
+ break;
4551
+ case GGML_TYPE_Q5_1:
4552
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4553
+ break;
4554
+ case GGML_TYPE_Q8_0:
4555
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4556
+ break;
4557
+ case GGML_TYPE_Q2_K:
4558
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4559
+ break;
4560
+ case GGML_TYPE_Q3_K:
4561
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4562
+ break;
4563
+ case GGML_TYPE_Q4_K:
4564
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4565
+ break;
4566
+ case GGML_TYPE_Q5_K:
4567
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4568
+ break;
4569
+ case GGML_TYPE_Q6_K:
4570
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4571
+ break;
4572
+ default:
4573
+ GGML_ASSERT(false);
4574
+ break;
4575
+ }
4576
+
4577
+ ggml_cuda_pool_free(src1_q8_1, as);
4578
+
4579
+ (void) src1;
4580
+ (void) dst;
4581
+ (void) src0_ddf_i;
4582
+ (void) i02;
4583
+ (void) i1;
4584
+ }
4585
+
2968
4586
  inline void ggml_cuda_op_mul_mat_vec(
2969
4587
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2970
4588
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
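Note: ggml_cuda_op_mul_mat_q quantizes the whole src1 slice to q8_1 at once, which is why quantize_row_q8_1_cuda now takes a row count (the vector path in the next hunk passes 1), and each row is first padded up to a multiple of MATRIX_ROW_PADDING. A worked example of the padding arithmetic, with an assumed padding value:

    // Round ne10 up to the next multiple of MATRIX_ROW_PADDING, as done before quantization.
    #include <cstdio>

    int main() {
        const int MATRIX_ROW_PADDING = 512;      // assumed value; defined elsewhere in the file
        const int ne10 = 4097;                   // example src1 row length

        const int padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        printf("ne10 = %d -> padded_row_size = %d\n", ne10, padded_row_size);   // 4097 -> 4608
        return 0;
    }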
@@ -2979,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2979
4597
 
2980
4598
  #ifdef GGML_CUDA_FORCE_DMMV
2981
4599
  const bool use_mul_mat_vec_q = false;
4600
+ (void) g_compute_capabilities[0];
2982
4601
  #else
2983
4602
  int id;
2984
4603
  CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +4625,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3006
4625
  ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
3007
4626
  size_t as;
3008
4627
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
3009
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
4628
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
3010
4629
 
3011
4630
  switch (src0->type) {
3012
4631
  case GGML_TYPE_Q4_0:
@@ -3047,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3047
4666
  ggml_cuda_pool_free(src1_q8_1, as);
3048
4667
  } else {
3049
4668
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
3050
- #ifdef GGML_CUDA_DMMV_F16
4669
+ #ifdef GGML_CUDA_F16
3051
4670
  size_t ash;
3052
4671
  dfloat * src1_dfloat = nullptr; // dfloat == half
3053
4672
 
@@ -3063,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3063
4682
  }
3064
4683
  #else
3065
4684
  dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
3066
- #endif // GGML_CUDA_DMMV_F16
4685
+ #endif // GGML_CUDA_F16
3067
4686
 
3068
4687
  switch (src0->type) {
3069
4688
  case GGML_TYPE_Q4_0:
@@ -3104,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
3104
4723
  break;
3105
4724
  }
3106
4725
 
3107
- #ifdef GGML_CUDA_DMMV_F16
4726
+ #ifdef GGML_CUDA_F16
3108
4727
  if (src1_convert_f16) {
3109
4728
  ggml_cuda_pool_free(src1_dfloat, ash);
3110
4729
  }
3111
- #endif // GGML_CUDA_DMMV_F16
4730
+ #endif // GGML_CUDA_F16
3112
4731
  }
3113
4732
 
3114
4733
  (void) src1;
@@ -3168,6 +4787,7 @@ inline void ggml_cuda_op_rope(
3168
4787
  GGML_ASSERT(dst_ddf_i != nullptr);
3169
4788
 
3170
4789
  const int64_t ne00 = src0->ne[0];
4790
+ const int64_t ne01 = src0->ne[1];
3171
4791
  const int64_t i01_diff = i01_high - i01_low;
3172
4792
 
3173
4793
  const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +4801,18 @@ inline void ggml_cuda_op_rope(
3181
4801
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
3182
4802
 
3183
4803
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
3184
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3185
4804
 
3186
- bool is_glm = mode & 4;
4805
+ const bool is_glm = mode & 4;
3187
4806
 
3188
4807
  // compute
3189
4808
  if (is_glm) {
4809
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3190
4810
  const float id_p = min(p, n_ctx - 2.f);
3191
4811
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
3192
4812
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3193
4813
  } else {
3194
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
4814
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
4815
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
3195
4816
  }
3196
4817
 
3197
4818
  (void) src1;
@@ -3363,7 +4984,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3363
4984
  int64_t row_low, row_high;
3364
4985
  if (split) {
3365
4986
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
3366
- row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
4987
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
4988
+
4989
+ if (id == g_device_count - 1) {
4990
+ row_high = nrows0;
4991
+ } else {
4992
+ row_high = nrows0*g_tensor_split[id + 1];
4993
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
4994
+ }
3367
4995
  } else {
3368
4996
  row_low = 0;
3369
4997
  row_high = nrows0*i02_divisor;
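Note: when a tensor is split across GPUs, the row boundaries are now snapped down to a multiple of GGML_CUDA_MMQ_Y so that a mul_mat_q tile never straddles two devices; only the last device keeps the exact nrows0 upper bound. A standalone sketch with assumed values:

    // Snap a split boundary down to a multiple of GGML_CUDA_MMQ_Y.
    #include <cstdio>

    int main() {
        const int   GGML_CUDA_MMQ_Y = 64;        // assumed tile height
        const int   nrows0 = 1000;
        const float split  = 0.37f;              // example g_tensor_split entry for the next device

        int row_high = (int)(nrows0 * split);    // 370
        row_high -= row_high % GGML_CUDA_MMQ_Y;  // -> 320
        printf("row_high = %d\n", row_high);
        return 0;
    }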
@@ -3529,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3529
5157
  if (split) {
3530
5158
  // src0 = weight matrix is saved as a transposed matrix for better memory layout.
3531
5159
  // dst is NOT transposed.
3532
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
5160
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
3533
5161
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
3534
5162
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
3535
- for (int64_t j = 0; j < ne1; ++j) {
3536
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
3537
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
3538
- }
5163
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
5164
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
5165
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
3539
5166
  } else {
3540
5167
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
3541
5168
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
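Note: the per-row cudaMemcpyAsync loop is replaced by a single strided cudaMemcpy2DAsync whose destination pitch is the full row length ne0 and whose source pitch is the slice width i01_diff. The sketch below reproduces the same strided copy on toy host buffers, using the synchronous cudaMemcpy2D for simplicity; all sizes are assumptions:

    // Strided 2D copy of an i01_diff-wide slice into rows of width ne0, offset by i01_low.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const int ne0 = 8, ne1 = 3, i01_low = 2, i01_diff = 4;   // toy dimensions
        float src[ne1 * i01_diff];                               // packed slice: i01_diff floats per row
        float dst[ne1 * ne0] = {0};                              // full destination: ne0 floats per row
        for (int i = 0; i < ne1 * i01_diff; ++i) src[i] = (float) i;

        cudaMemcpy2D(dst + i01_low, ne0*sizeof(float),           // destination + pitch
                     src,           i01_diff*sizeof(float),      // source + pitch
                     i01_diff*sizeof(float), ne1,                // width in bytes, height in rows
                     cudaMemcpyHostToHost);

        for (int j = 0; j < ne1; ++j) {
            for (int i = 0; i < ne0; ++i) printf("%4.0f", dst[j*ne0 + i]);
            printf("\n");
        }
        return 0;
    }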
@@ -3718,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
3718
5345
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3719
5346
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
3720
5347
  } else {
3721
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5348
+ int min_compute_capability = INT_MAX;
5349
+ for (int id = 0; id < g_device_count; ++id) {
5350
+ if (min_compute_capability > g_compute_capabilities[id]) {
5351
+ min_compute_capability = g_compute_capabilities[id];
5352
+ }
5353
+ }
5354
+
5355
+ if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
5356
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
5357
+ } else {
5358
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5359
+ }
3722
5360
  }
3723
5361
  } else {
3724
5362
  GGML_ASSERT(false);
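Note: ggml_cuda_mul_mat now takes the minimum compute capability across all devices and only routes quantized src0 through the new mul_mat_q path when g_mul_mat_q is enabled and that minimum reaches MIN_CC_DP4A; otherwise it falls back to cuBLAS. A standalone sketch of the decision (capabilities and threshold are illustrative assumptions):

    // Minimum-capability reduction and the mul_mat_q / cuBLAS decision.
    #include <climits>
    #include <cstdio>

    int main() {
        const int  g_compute_capabilities[] = {750, 610};   // assumed per-device values
        const int  g_device_count = 2;
        const int  MIN_CC_DP4A = 610;                        // assumed threshold, not shown in this hunk
        const bool g_mul_mat_q = true;                       // set via ggml_cuda_set_mul_mat_q
        const bool src0_is_quantized = true;                 // ggml_is_quantized(src0->type) in the real code

        int min_compute_capability = INT_MAX;
        for (int id = 0; id < g_device_count; ++id) {
            if (min_compute_capability > g_compute_capabilities[id]) {
                min_compute_capability = g_compute_capabilities[id];
            }
        }
        const bool use_mmq = g_mul_mat_q && src0_is_quantized && min_compute_capability >= MIN_CC_DP4A;
        printf("min cc = %d -> %s path\n", min_compute_capability, use_mmq ? "mul_mat_q" : "cuBLAS");
        return 0;
    }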
@@ -3795,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
3795
5433
 
3796
5434
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3797
5435
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3798
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
5436
+
5437
+ const int mode = ((int32_t *) dst->op_params)[2];
5438
+ const bool is_glm = mode & 4;
5439
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
3799
5440
  }
3800
5441
 
3801
5442
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3828
5469
  row_high = nrows;
3829
5470
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
3830
5471
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
3831
- row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
5472
+ row_low -= row_low % GGML_CUDA_MMQ_Y;
5473
+
5474
+ if (id == g_device_count - 1) {
5475
+ row_high = nrows;
5476
+ } else {
5477
+ row_high = nrows*g_tensor_split[id + 1];
5478
+ row_high -= row_high % GGML_CUDA_MMQ_Y;
5479
+ }
3832
5480
  } else {
3833
5481
  GGML_ASSERT(false);
3834
5482
  }
@@ -4002,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
4002
5650
  }
4003
5651
  }
4004
5652
 
5653
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
5654
+ g_mul_mat_q = mul_mat_q;
5655
+ }
5656
+
4005
5657
  void ggml_cuda_set_scratch_size(size_t scratch_size) {
4006
5658
  g_scratch_size = scratch_size;
4007
5659
  }
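Note: ggml_cuda_set_mul_mat_q is the switch behind the dispatch shown earlier; it simply stores the flag read by ggml_cuda_mul_mat. A hypothetical usage sketch (the declaration and the call site below are assumptions; only the setter itself comes from this diff):

    // Hypothetical caller enabling the quantized mat-mul path before evaluating a graph.
    extern "C" void ggml_cuda_set_mul_mat_q(bool mul_mat_q);

    int main() {
        ggml_cuda_set_mul_mat_q(true);   // route quantized mat-muls through mul_mat_q when supported
        // ... build and evaluate the ggml graph as usual ...
        return 0;
    }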