llama_cpp 0.3.5 → 0.3.7

@@ -14,6 +14,7 @@
14
14
  #include "ggml.h"
15
15
 
16
16
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+ #define CC_TURING 700
17
18
 
18
19
  #if defined(_MSC_VER)
19
20
  #pragma warning(disable: 4244 4267) // possible loss of data
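The MIN_CC_DP4A guard kept as context above refers to __dp4a, the byte-wise dot-product intrinsic that every vec_dot kernel in this diff is built on: it multiplies four packed signed bytes from each operand and accumulates the products into a 32-bit integer. A minimal host-side emulation for reference (dp4a_ref is a sketch name, not part of the diff):

    // Host-side emulation of __dp4a(a, b, c): per-byte signed products, 32-bit accumulate.
    #include <cstdint>
    #include <cstdio>

    static int dp4a_ref(int a, int b, int c) {
        for (int k = 0; k < 4; ++k) {
            c += int(int8_t(a >> 8*k)) * int(int8_t(b >> 8*k));
        }
        return c;
    }

    int main() {
        const int a = 0x04030201;                // packed bytes 1, 2, 3, 4
        const int b = 0x08070605;                // packed bytes 5, 6, 7, 8
        std::printf("%d\n", dp4a_ref(a, b, 0));  // 1*5 + 2*6 + 3*7 + 4*8 = 70
    }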
@@ -52,13 +53,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
53
  } while (0)
53
54
  #endif // CUDART_VERSION >= 11
54
55
 
55
- #ifdef GGML_CUDA_DMMV_F16
56
+ #ifdef GGML_CUDA_F16
56
57
  typedef half dfloat; // dequantize float
57
58
  typedef half2 dfloat2;
58
59
  #else
59
60
  typedef float dfloat; // dequantize float
60
61
  typedef float2 dfloat2;
61
- #endif //GGML_CUDA_DMMV_F16
62
+ #endif //GGML_CUDA_F16
63
+
64
+ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
65
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
66
+
67
+ int x32 = 0;
68
+ x32 |= x16[0] << 0;
69
+ x32 |= x16[1] << 16;
70
+
71
+ return x32;
72
+ }
73
+
74
+ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
75
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
76
+
77
+ int x32 = 0;
78
+ x32 |= x16[0] << 0;
79
+ x32 |= x16[1] << 16;
80
+
81
+ return x32;
82
+ }
83
+
84
+ static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
85
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
86
+ }
87
+
88
+ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
89
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
90
+ }
62
91
 
63
92
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
64
93
  typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
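The get_int_from_* helpers added in this hunk exist, per their own comments, because some quantized blocks only guarantee 2-byte alignment of their qs arrays (their header is a single half), while blocks with a half2 header give 4-byte alignment: the plain variants splice a 32-bit value out of two 16-bit reads, the *_aligned variants do one int load. A host-side sketch of the same splicing, assuming little-endian byte order (get_int_from_uint8_ref is our name for the sketch):

    // Reference for the 2-byte-aligned path: two 16-bit reads instead of one 32-bit read.
    #include <cstdint>
    #include <cstring>
    #include <cassert>

    static int get_int_from_uint8_ref(const uint8_t * x8, int i32) {
        uint16_t lo, hi;
        std::memcpy(&lo, x8 + sizeof(int) * i32 + 0, sizeof(lo));
        std::memcpy(&hi, x8 + sizeof(int) * i32 + 2, sizeof(hi));
        return int(uint32_t(lo) | (uint32_t(hi) << 16));
    }

    int main() {
        const uint8_t qs[8] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
        assert(get_int_from_uint8_ref(qs, 0) == 0x04030201);
        assert(get_int_from_uint8_ref(qs, 1) == 0x08070605);
    }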
@@ -87,8 +116,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
87
116
  #define QR4_1 2
88
117
  #define QI4_1 (QK4_1 / (4 * QR4_1))
89
118
  typedef struct {
90
- half d; // delta
91
- half m; // min
119
+ half2 dm; // dm.x = delta, dm.y = min
92
120
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
93
121
  } block_q4_1;
94
122
  static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
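The change above (repeated below for q5_1, q8_1 and the k-quant structs) replaces two separate half fields with a single half2, so kernels can fetch delta and min with one 32-bit load and convert both at once with __half22float2. The block size and field offsets do not change, which is why the static_assert is untouched. A small host-side stand-in to illustrate (the *_old/*_new struct names and the half stand-ins are ours, not from the source):

    // Layout check with plain stand-ins for CUDA's half/half2 types.
    #include <cstddef>
    #include <cstdint>

    struct half_t  { uint16_t bits; };        // stand-in for half
    struct half2_t { half_t x, y; };          // stand-in for half2

    #define QK4_1 32

    struct block_q4_1_old { half_t d, m;  uint8_t qs[QK4_1 / 2]; };
    struct block_q4_1_new { half2_t dm;   uint8_t qs[QK4_1 / 2]; };  // dm.x = delta, dm.y = min

    static_assert(sizeof(block_q4_1_old) == sizeof(block_q4_1_new), "packing d/m into half2 keeps the block size");
    static_assert(offsetof(block_q4_1_new, qs) == 4, "quants still start right after the two scales");

    int main() {}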
@@ -107,8 +135,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
107
135
  #define QR5_1 2
108
136
  #define QI5_1 (QK5_1 / (4 * QR5_1))
109
137
  typedef struct {
110
- half d; // delta
111
- half m; // min
138
+ half2 dm; // dm.x = delta, dm.y = min
112
139
  uint8_t qh[4]; // 5-th bit of quants
113
140
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
114
141
  } block_q5_1;
@@ -127,13 +154,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
127
154
  #define QR8_1 1
128
155
  #define QI8_1 (QK8_1 / (4 * QR8_1))
129
156
  typedef struct {
130
- half d; // delta
131
- half s; // unquantized sum
157
+ half2 ds; // ds.x = delta, ds.y = sum
132
158
  int8_t qs[QK8_0]; // quants
133
159
  } block_q8_1;
134
160
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
135
161
 
136
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
162
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
163
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
164
+ typedef void (*load_tiles_cuda_t)(
165
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
166
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
167
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
168
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
169
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
137
170
 
138
171
  //================================= k-quants
139
172
 
@@ -150,8 +183,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
150
183
  typedef struct {
151
184
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
152
185
  uint8_t qs[QK_K/4]; // quants
153
- half d; // super-block scale for quantized scales
154
- half dmin; // super-block scale for quantized mins
186
+ half2 dm; // super-block scale for quantized scales/mins
155
187
  } block_q2_K;
156
188
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
157
189
 
@@ -180,8 +212,7 @@ typedef struct {
180
212
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
181
213
  #else
182
214
  typedef struct {
183
- half d; // super-block scale for quantized scales
184
- half dmin; // super-block scale for quantized mins
215
+ half2 dm; // super-block scale for quantized scales/mins
185
216
  uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
186
217
  uint8_t qs[QK_K/2]; // 4--bit quants
187
218
  } block_q4_K;
@@ -200,11 +231,10 @@ typedef struct {
200
231
  static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
201
232
  #else
202
233
  typedef struct {
203
- half d; // super-block scale for quantized scales
204
- half dmin; // super-block scale for quantized mins
205
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
206
- uint8_t qh[QK_K/8]; // quants, high bit
207
- uint8_t qs[QK_K/2]; // quants, low 4 bits
234
+ half2 dm; // super-block scale for quantized scales/mins
235
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
236
+ uint8_t qh[QK_K/8]; // quants, high bit
237
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
208
238
  } block_q5_K;
209
239
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
210
240
  #endif
@@ -252,6 +282,20 @@ struct ggml_tensor_extra_gpu {
252
282
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
253
283
  };
254
284
 
285
+ static int g_device_count = -1;
286
+ static int g_main_device = 0;
287
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
288
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
289
+ static bool g_mul_mat_q = false;
290
+
291
+ static void * g_scratch_buffer = nullptr;
292
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
293
+ static size_t g_scratch_offset = 0;
294
+
295
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
296
+
297
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
298
+
255
299
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
256
300
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
257
301
 
@@ -367,33 +411,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
367
411
  v.x = vui & 0xF;
368
412
  v.y = vui >> 4;
369
413
 
370
- #ifdef GGML_CUDA_DMMV_F16
414
+ #ifdef GGML_CUDA_F16
371
415
  v = __hsub2(v, {8.0f, 8.0f});
372
416
  v = __hmul2(v, {d, d});
373
417
  #else
374
418
  v.x = (v.x - 8.0f) * d;
375
419
  v.y = (v.y - 8.0f) * d;
376
- #endif // GGML_CUDA_DMMV_F16
420
+ #endif // GGML_CUDA_F16
377
421
  }
378
422
 
379
423
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
380
424
  const block_q4_1 * x = (const block_q4_1 *) vx;
381
425
 
382
- const dfloat d = x[ib].d;
383
- const dfloat m = x[ib].m;
426
+ const dfloat d = x[ib].dm.x;
427
+ const dfloat m = x[ib].dm.y;
384
428
 
385
429
  const int vui = x[ib].qs[iqs];
386
430
 
387
431
  v.x = vui & 0xF;
388
432
  v.y = vui >> 4;
389
433
 
390
- #ifdef GGML_CUDA_DMMV_F16
434
+ #ifdef GGML_CUDA_F16
391
435
  v = __hmul2(v, {d, d});
392
436
  v = __hadd2(v, {m, m});
393
437
  #else
394
438
  v.x = (v.x * d) + m;
395
439
  v.y = (v.y * d) + m;
396
- #endif // GGML_CUDA_DMMV_F16
440
+ #endif // GGML_CUDA_F16
397
441
  }
398
442
 
399
443
  static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -410,20 +454,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
410
454
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
411
455
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
412
456
 
413
- #ifdef GGML_CUDA_DMMV_F16
457
+ #ifdef GGML_CUDA_F16
414
458
  v = __hsub2(v, {16.0f, 16.0f});
415
459
  v = __hmul2(v, {d, d});
416
460
  #else
417
461
  v.x = (v.x - 16.0f) * d;
418
462
  v.y = (v.y - 16.0f) * d;
419
- #endif // GGML_CUDA_DMMV_F16
463
+ #endif // GGML_CUDA_F16
420
464
  }
421
465
 
422
466
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
423
467
  const block_q5_1 * x = (const block_q5_1 *) vx;
424
468
 
425
- const dfloat d = x[ib].d;
426
- const dfloat m = x[ib].m;
469
+ const dfloat d = x[ib].dm.x;
470
+ const dfloat m = x[ib].dm.y;
427
471
 
428
472
  uint32_t qh;
429
473
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +478,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
434
478
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
435
479
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
436
480
 
437
- #ifdef GGML_CUDA_DMMV_F16
481
+ #ifdef GGML_CUDA_F16
438
482
  v = __hmul2(v, {d, d});
439
483
  v = __hadd2(v, {m, m});
440
484
  #else
441
485
  v.x = (v.x * d) + m;
442
486
  v.y = (v.y * d) + m;
443
- #endif // GGML_CUDA_DMMV_F16
487
+ #endif // GGML_CUDA_F16
444
488
  }
445
489
 
446
490
  static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +495,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
451
495
  v.x = x[ib].qs[iqs + 0];
452
496
  v.y = x[ib].qs[iqs + 1];
453
497
 
454
- #ifdef GGML_CUDA_DMMV_F16
498
+ #ifdef GGML_CUDA_F16
455
499
  v = __hmul2(v, {d, d});
456
500
  #else
457
501
  v.x *= d;
458
502
  v.y *= d;
459
- #endif // GGML_CUDA_DMMV_F16
503
+ #endif // GGML_CUDA_F16
460
504
  }
461
505
 
462
506
  //================================== k-quants
@@ -475,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
475
519
  const uint8_t q = x[i].qs[32*n + l];
476
520
  float * y = yy + i*QK_K + 128*n;
477
521
 
478
- float dall = x[i].d;
479
- float dmin = x[i].dmin;
522
+ float dall = x[i].dm.x;
523
+ float dmin = x[i].dm.y;
480
524
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
481
525
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
482
526
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +530,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
486
530
  const int il = tid%16; // 0...15
487
531
  const uint8_t q = x[i].qs[il] >> (2*is);
488
532
  float * y = yy + i*QK_K + 16*is + il;
489
- float dall = x[i].d;
490
- float dmin = x[i].dmin;
533
+ float dall = x[i].dm.x;
534
+ float dmin = x[i].dm.y;
491
535
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
492
536
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
493
537
  #endif
@@ -573,8 +617,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
573
617
 
574
618
  float * y = yy + i*QK_K + 64*il + n*ir;
575
619
 
576
- const float dall = x[i].d;
577
- const float dmin = x[i].dmin;
620
+ const float dall = x[i].dm.x;
621
+ const float dmin = x[i].dm.y;
578
622
 
579
623
  const uint8_t * q = x[i].qs + 32*il + n*ir;
580
624
 
@@ -612,8 +656,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
612
656
 
613
657
  float * y = yy + i*QK_K + 64*il + 2*ir;
614
658
 
615
- const float dall = x[i].d;
616
- const float dmin = x[i].dmin;
659
+ const float dall = x[i].dm.x;
660
+ const float dmin = x[i].dm.y;
617
661
 
618
662
  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
619
663
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +769,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
725
769
  const float * y = yy + i * QK_K + y_offset;
726
770
  const uint8_t * q = x[i].qs + q_offset;
727
771
 
728
- const float dall = x[i].d;
729
- const float dmin = x[i].dmin;
772
+ const float dall = x[i].dm.x;
773
+ const float dmin = x[i].dm.y;
730
774
 
731
775
  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
732
776
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +812,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
768
812
  uaux[0] = s[0] & 0x0f0f0f0f;
769
813
  uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
770
814
 
771
- const half2 * dh = (const half2 *)&x[i].d;
772
-
773
- const float2 dall = __half22float2(dh[0]);
815
+ const float2 dall = __half22float2(x[i].dm);
774
816
 
775
817
  float sum1 = 0, sum2 = 0;
776
818
  for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +990,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
948
990
  const float * y1 = yy + i*QK_K + y_offset;
949
991
  const float * y2 = y1 + 128;
950
992
 
951
- const float dall = x[i].d;
952
- const float dmin = x[i].dmin;
993
+ const float dall = x[i].dm.x;
994
+ const float dmin = x[i].dm.y;
953
995
 
954
996
  const uint16_t * a = (const uint16_t *)x[i].scales;
955
997
  aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1123,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1081
1123
  const float * y1 = yy + i*QK_K + y_offset;
1082
1124
  const float * y2 = y1 + 128;
1083
1125
 
1084
- const float dall = x[i].d;
1085
- const float dmin = x[i].dmin;
1126
+ const float dall = x[i].dm.x;
1127
+ const float dmin = x[i].dm.y;
1086
1128
 
1087
1129
  const uint16_t * a = (const uint16_t *)x[i].scales;
1088
1130
  aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1312,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1270
1312
  v.y = x[ib + iqs + 1];
1271
1313
  }
1272
1314
 
1273
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1274
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
1315
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1316
+ const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1275
1317
 
1276
- if (i >= k) {
1318
+ if (ix >= kx_padded) {
1277
1319
  return;
1278
1320
  }
1279
1321
 
1322
+ const int iy = blockDim.y*blockIdx.y + threadIdx.y;
1323
+
1324
+ const int i_padded = iy*kx_padded + ix;
1325
+
1280
1326
  block_q8_1 * y = (block_q8_1 *) vy;
1281
1327
 
1282
- const int ib = i / QK8_1; // block index
1283
- const int iqs = i % QK8_1; // quant index
1328
+ const int ib = i_padded / QK8_1; // block index
1329
+ const int iqs = i_padded % QK8_1; // quant index
1284
1330
 
1285
- const float xi = i < ndata ? x[i] : 0.0f;
1331
+ const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
1286
1332
  float amax = fabsf(xi);
1287
1333
  float sum = xi;
1288
1334
 
@@ -1301,8 +1347,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1301
1347
  return;
1302
1348
  }
1303
1349
 
1304
- y[ib].d = d;
1305
- y[ib].s = sum;
1350
+ y[ib].ds.x = d;
1351
+ y[ib].ds.y = sum;
1306
1352
  }
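quantize_q8_1 above now quantizes a whole kx by ky matrix at once: each row is padded out to kx_padded (a multiple of QK8_1) and columns with ix >= kx contribute 0.0f, instead of the old single flat array of ndata values. A host-side sketch of the new index math, with plain loops standing in for the 2D thread grid (kx, ky and mmq-style constants here are example values):

    // Padded row indexing mirroring the new quantize_q8_1.
    #include <cstdio>
    #include <vector>

    int main() {
        const int QK8_1 = 32;
        const int kx = 70, ky = 2;                                 // example logical matrix size
        const int kx_padded = ((kx + QK8_1 - 1) / QK8_1) * QK8_1;  // row length rounded up to whole blocks

        const std::vector<float> x(size_t(ky) * kx, 1.0f);

        for (int iy = 0; iy < ky; ++iy) {                          // stands in for the grid's y dimension
            for (int ix = 0; ix < kx_padded; ++ix) {               // stands in for the grid's x dimension
                const int   i_padded = iy*kx_padded + ix;
                const float xi       = ix < kx ? x[size_t(iy)*kx + ix] : 0.0f;  // zero-pad the partial block
                const int   ib       = i_padded / QK8_1;           // destination block index
                const int   iqs      = i_padded % QK8_1;           // quant index inside that block
                (void)xi; (void)ib; (void)iqs;                     // a real kernel quantizes xi into block ib here
            }
        }
        std::printf("kx_padded = %d -> %d blocks per row\n", kx_padded, kx_padded / QK8_1);
    }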
1307
1353
 
1308
1354
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1372,1876 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1326
1372
  y[iybs + iqs + y_offset] = v.y;
1327
1373
  }
1328
1374
 
1329
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1330
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1331
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1332
- const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1375
+ // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
1376
+ // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
1377
+
1378
+ #define VDR_Q4_0_Q8_1_MMVQ 2
1379
+ #define VDR_Q4_0_Q8_1_MMQ 4
1380
+
1381
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
1382
+ const int * v, const int * u, const float & d4, const half2 & ds8) {
1333
1383
 
1334
- int vi;
1335
- memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1336
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1337
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1384
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1385
+ int sumi = 0;
1338
1386
 
1339
- const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1387
+ #pragma unroll
1388
+ for (int i = 0; i < vdr; ++i) {
1389
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1390
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1340
1391
 
1341
- // subtract 8 from each quantized value
1342
- const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1343
- const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1392
+ // SIMD dot product of quantized values
1393
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1394
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1395
+ }
1344
1396
 
1345
- // SIMD dot product of quantized values
1346
- int sumi = __dp4a(vi0, ui0, 0);
1347
- sumi = __dp4a(vi1, ui1, sumi);
1397
+ const float2 ds8f = __half22float2(ds8);
1348
1398
 
1349
- return sumi*d;
1399
+ // second part effectively subtracts 8 from each quant value
1400
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1350
1401
  #else
1351
1402
  return 0.0f; // only to satisfy the compiler
1352
1403
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1353
1404
  }
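The rewritten q4_0 kernel above no longer subtracts 8 from each nibble before the dot product; it uses the identity sum((q4 - 8) * q8) = sum(q4 * q8) - 8 * sum(q8) and takes the second term from the q8_1 block sum carried in ds8.y. A whole-block host-side check of that folding (exact here because we form d8 * sum(q8) directly; in the kernel ds8.y holds the pre-quantization sum, which matches up to rounding):

    // Reference vs. folded q4_0 x q8_1 dot product for one block.
    #include <cstdio>

    int main() {
        const int QK = 32;
        int q4[QK], q8[QK];
        const float d4 = 0.5f, d8 = 0.25f;
        for (int i = 0; i < QK; ++i) { q4[i] = i % 16; q8[i] = i - 16; }

        float ref = 0.0f;                                  // dequantize, then multiply-accumulate
        for (int i = 0; i < QK; ++i) ref += (d4 * (q4[i] - 8)) * (d8 * q8[i]);

        int sumi = 0, sum8 = 0;                            // one integer dot product plus one integer sum
        for (int i = 0; i < QK; ++i) { sumi += q4[i] * q8[i]; sum8 += q8[i]; }
        const float folded = d4 * (sumi * d8 - 8.0f * (d8 * sum8));

        std::printf("ref = %g, folded = %g\n", ref, folded); // the two agree
    }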
1354
1405
 
1355
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1356
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1357
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1358
- const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1406
+ #define VDR_Q4_1_Q8_1_MMVQ 2
1407
+ #define VDR_Q4_1_Q8_1_MMQ 4
1359
1408
 
1360
- const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1361
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1362
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1409
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
1410
+ const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
1363
1411
 
1364
- const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1365
- const float m = bq4_1->m;
1366
- const float s = bq8_1->s;
1412
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1413
+ int sumi = 0;
1367
1414
 
1368
- const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1369
- const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1415
+ #pragma unroll
1416
+ for (int i = 0; i < vdr; ++i) {
1417
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1418
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1370
1419
 
1371
- // SIMD dot product of quantized values
1372
- int sumi = __dp4a(vi0, ui0, 0);
1373
- sumi = __dp4a(vi1, ui1, sumi);
1420
+ // SIMD dot product of quantized values
1421
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1422
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1423
+ }
1374
1424
 
1375
- return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1425
+ #ifdef GGML_CUDA_F16
1426
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
1427
+ const float d4d8 = tmp.x;
1428
+ const float m4s8 = tmp.y;
1429
+ #else
1430
+ const float2 dm4f = __half22float2(dm4);
1431
+ const float2 ds8f = __half22float2(ds8);
1432
+ const float d4d8 = dm4f.x * ds8f.x;
1433
+ const float m4s8 = dm4f.y * ds8f.y;
1434
+ #endif // GGML_CUDA_F16
1435
+
1436
+ // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1437
+ return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1376
1438
  #else
1377
1439
  return 0.0f; // only to satisfy the compiler
1378
1440
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1379
1441
  }
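In vec_dot_q4_1_q8_1_impl above, the sumi*d4d8 part differs per thread, but m4s8 is a per-block constant that every thread covering the block would otherwise add in full; dividing it by QI8_1/(vdr*QR4_1), the number of threads that cooperate on one q4_1 block, makes the block-wide total come out right after the partial sums are reduced. The q5_1 and q8_1 variants below use the same idea. A small arithmetic check with the MMVQ values:

    // QK8_1/(4*QR8_1) = 8, QR4_1 = 2, vdr = VDR_Q4_1_Q8_1_MMVQ = 2  ->  2 threads per block.
    #include <cstdio>

    int main() {
        const int QI8_1 = 8, QR4_1 = 2, vdr = 2;
        const int threads_per_block = QI8_1 / (vdr * QR4_1);

        const float m4s8 = 3.0f;                       // per-block constant: min(q4_1) * sum(q8_1)
        float total = 0.0f;
        for (int t = 0; t < threads_per_block; ++t) {
            total += m4s8 / threads_per_block;         // each thread adds only its share
        }
        std::printf("%g\n", total);                    // sums back to exactly m4s8
    }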
1380
1442
 
1381
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1382
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1443
+ #define VDR_Q5_0_Q8_1_MMVQ 2
1444
+ #define VDR_Q5_0_Q8_1_MMQ 4
1445
+
1446
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
1447
+ const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
1448
+
1383
1449
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1384
- const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1450
+ int sumi = 0;
1451
+
1452
+ #pragma unroll
1453
+ for (int i = 0; i < vdr; ++i) {
1454
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1455
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1456
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1457
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1458
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1459
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1460
+
1461
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1462
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1463
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1464
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1465
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1466
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1467
+ }
1468
+
1469
+ const float2 ds8f = __half22float2(ds8);
1385
1470
 
1386
- int qs;
1387
- memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1388
- const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1389
- const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1390
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1391
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1392
-
1393
- const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1394
-
1395
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1396
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1397
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1398
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1399
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1400
- vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1401
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1402
-
1403
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1404
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1405
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1406
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1407
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1408
- vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1409
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1410
-
1411
- return sumi*d;
1471
+ // second part effectively subtracts 16 from each quant value
1472
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
1412
1473
  #else
1413
1474
  return 0.0f; // only to satisfy the compiler
1414
1475
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1415
1476
  }
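The q5_0/q5_1 implementations above rebuild full 5-bit quants on the fly: the low nibbles sit one per byte in vl, and the shift/mask ladder moves the matching high bits from vh into bit positions 4, 12, 20 and 28 (for q5_0 the constant -16 offset is then folded into the ds8.y term, as the code comments note). A host-side check of that splicing for four quants, with values chosen only for illustration:

    // Splice the 5th bits (qh) into the packed low nibbles, as in vec_dot_q5_0_q8_1_impl.
    #include <cstdio>

    int main() {
        const int vh  = 0b1010;               // 5th bits of four consecutive quants
        int       vi0 = 0x03020100;           // their low nibbles, packed one per byte: 0, 1, 2, 3

        vi0 |= (vh << 4)  & 0x00000010;       // bit 0 -> bit 4  (5th bit of byte 0)
        vi0 |= (vh << 11) & 0x00001000;       // bit 1 -> bit 12 (5th bit of byte 1)
        vi0 |= (vh << 18) & 0x00100000;       // bit 2 -> bit 20 (5th bit of byte 2)
        vi0 |= (vh << 25) & 0x10000000;       // bit 3 -> bit 28 (5th bit of byte 3)

        std::printf("%08X\n", vi0);           // 13021100: the quants are now 0, 17, 2, 19
    }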
1416
1477
 
1417
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1418
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1478
+ #define VDR_Q5_1_Q8_1_MMVQ 2
1479
+ #define VDR_Q5_1_Q8_1_MMQ 4
1480
+
1481
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
1482
+ const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
1483
+
1419
1484
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1420
- const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1485
+ int sumi = 0;
1486
+
1487
+ #pragma unroll
1488
+ for (int i = 0; i < vdr; ++i) {
1489
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1490
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1491
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1492
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1493
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1494
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1495
+
1496
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1497
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1498
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1499
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1500
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1501
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1502
+ }
1503
+
1504
+ #ifdef GGML_CUDA_F16
1505
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
1506
+ const float d5d8 = tmp.x;
1507
+ const float m5s8 = tmp.y;
1508
+ #else
1509
+ const float2 dm5f = __half22float2(dm5);
1510
+ const float2 ds8f = __half22float2(ds8);
1511
+ const float d5d8 = dm5f.x * ds8f.x;
1512
+ const float m5s8 = dm5f.y * ds8f.y;
1513
+ #endif // GGML_CUDA_F16
1514
+
1515
+ // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
1516
+ return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1421
1517
 
1422
- const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1423
- const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1424
- const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1425
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1426
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1427
-
1428
- const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1429
- const float m = bq5_1->m;
1430
- const float s = bq8_1->s;
1431
-
1432
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1433
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1434
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1435
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1436
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1437
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1438
-
1439
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1440
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1441
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1442
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1443
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1444
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1445
-
1446
- return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1447
1518
  #else
1448
1519
  return 0.0f; // only to satisfy the compiler
1449
1520
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1450
1521
  }
1451
1522
 
1452
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1453
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1454
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1455
- const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1523
+ #define VDR_Q8_0_Q8_1_MMVQ 2
1524
+ #define VDR_Q8_0_Q8_1_MMQ 8
1456
1525
 
1457
- int vi;
1458
- memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1459
- const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1526
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1527
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {
1460
1528
 
1461
- const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1529
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1530
+ int sumi = 0;
1462
1531
 
1463
- // SIMD dot product of quantized values
1464
- int sumi = __dp4a(vi, ui, 0);
1532
+ #pragma unroll
1533
+ for (int i = 0; i < vdr; ++i) {
1534
+ // SIMD dot product of quantized values
1535
+ sumi = __dp4a(v[i], u[i], sumi);
1536
+ }
1465
1537
 
1466
- return sumi*d;
1538
+ return d8_0*d8_1 * sumi;
1467
1539
  #else
1468
1540
  return 0.0f; // only to satisfy the compiler
1469
1541
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1470
1542
  }
1471
1543
 
1472
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1473
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1544
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
1545
+ const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
1474
1546
 
1475
1547
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1476
- const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1548
+ int sumi = 0;
1477
1549
 
1478
- const int bq8_offset = QR2_K * (iqs / QI8_1);
1479
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1550
+ #pragma unroll
1551
+ for (int i = 0; i < vdr; ++i) {
1552
+ // SIMD dot product of quantized values
1553
+ sumi = __dp4a(v[i], u[i], sumi);
1554
+ }
1480
1555
 
1481
- float sumf_d = 0.0f;
1482
- float sumf_m = 0.0f;
1556
+ #ifdef GGML_CUDA_F16
1557
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
1558
+ const float d8d8 = tmp.x;
1559
+ const float m8s8 = tmp.y;
1560
+ #else
1561
+ const float2 dm8f = __half22float2(dm8);
1562
+ const float2 ds8f = __half22float2(ds8);
1563
+ const float d8d8 = dm8f.x * ds8f.x;
1564
+ const float m8s8 = dm8f.y * ds8f.y;
1565
+ #endif // GGML_CUDA_F16
1566
+
1567
+ // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1568
+ return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1569
+ #else
1570
+ return 0.0f; // only to satisfy the compiler
1571
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1572
+ }
1483
1573
 
1484
- const float d = bq2_K->d;
1485
- const float dmin = bq2_K->dmin;
1574
+ #define VDR_Q2_K_Q8_1_MMVQ 1
1575
+ #define VDR_Q2_K_Q8_1_MMQ 2
1486
1576
 
1487
- const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1577
+ // contiguous v/x values
1578
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1579
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1580
+ const half2 & dm2, const float * __restrict__ d8) {
1488
1581
 
1489
- for (int i = 0; i < QR2_K; ++i) {
1490
- const int sc = bq2_K->scales[scale_offset + 2*i];
1582
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1583
+ float sumf_d = 0.0f;
1584
+ float sumf_m = 0.0f;
1491
1585
 
1492
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1493
- const float d8i = bq8i->d;
1586
+ #pragma unroll
1587
+ for (int i = 0; i < QR2_K; ++i) {
1588
+ const int sc = scales[2*i];
1494
1589
 
1495
1590
  const int vi = (v >> (2*i)) & 0x03030303;
1496
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1497
1591
 
1498
- sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1499
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1592
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
1593
+
1594
+ // fill int with 4x m
1595
+ int m = sc >> 4;
1596
+ m |= m << 8;
1597
+ m |= m << 16;
1598
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
1500
1599
  }
1501
1600
 
1502
- return d*sumf_d - dmin*sumf_m;
1601
+ const float2 dm2f = __half22float2(dm2);
1602
+
1603
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
1503
1604
  #else
1504
1605
  return 0.0f; // only to satisfy the compiler
1505
1606
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1506
1607
  }
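The "fill int with 4x m" step above broadcasts the 4-bit min scale into every byte so that a single __dp4a(m, u[i], 0) yields m times the sum of the four packed q8 values, folding the multiply of the old __dp4a(0x01010101, ui, 0) * (sc >> 4) form into the dot product itself. A host-side check (dp4a_ref is the same emulation sketched near the top of the diff):

    // Broadcasting m into all four byte lanes turns one dp4a into m * sum(bytes of u).
    #include <cstdint>
    #include <cstdio>

    static int dp4a_ref(int a, int b, int c) {
        for (int k = 0; k < 4; ++k)
            c += int(int8_t(a >> 8*k)) * int(int8_t(b >> 8*k));
        return c;
    }

    int main() {
        int m = 5;                                // 4-bit min from the q2_K scales
        m |= m << 8;
        m |= m << 16;                             // m == 0x05050505
        const int u = 0x02FD0401;                 // packed q8 bytes: 1, 4, -3, 2
        std::printf("%d\n", dp4a_ref(m, u, 0));   // 5 * (1 + 4 - 3 + 2) = 20
    }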
1507
1608
 
1508
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1509
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1609
+ // contiguous u/y values
1610
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
1611
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1612
+ const half2 & dm2, const float & d8) {
1510
1613
 
1511
1614
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1512
- const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1615
+ int sumi_d = 0;
1616
+ int sumi_m = 0;
1513
1617
 
1514
- const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1515
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1618
+ #pragma unroll
1619
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
1620
+ int sumi_d_sc = 0;
1516
1621
 
1517
- float sumf = 0.0f;
1622
+ const int sc = scales[i0 / (QI8_1/2)];
1518
1623
 
1519
- const float d = bq3_K->d;
1624
+ // fill int with 4x m
1625
+ int m = sc >> 4;
1626
+ m |= m << 8;
1627
+ m |= m << 16;
1628
+
1629
+ #pragma unroll
1630
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1631
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
1632
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
1633
+ }
1634
+
1635
+ sumi_d += sumi_d_sc * (sc & 0xF);
1636
+ }
1637
+
1638
+ const float2 dm2f = __half22float2(dm2);
1639
+
1640
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
1641
+ #else
1642
+ return 0.0f; // only to satisfy the compiler
1643
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1644
+ }
1520
1645
 
1521
- int vl;
1522
- memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1646
+ #define VDR_Q3_K_Q8_1_MMVQ 1
1647
+ #define VDR_Q3_K_Q8_1_MMQ 2
1523
1648
 
1524
- int vh;
1525
- memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1526
- vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1527
- vh >>= bq8_offset;
1649
+ // contiguous v/x values
1650
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
1651
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1652
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
1653
+
1654
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1655
+ float sumf = 0.0f;
1528
1656
 
1657
+ #pragma unroll
1529
1658
  for (int i = 0; i < QR3_K; ++i) {
1530
1659
  const int isc = scale_offset + 2*i;
1531
1660
 
1532
1661
  const int isc_low = isc % (QK_K/32);
1533
1662
  const int sc_shift_low = 4 * (isc / (QK_K/32));
1534
- const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1663
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
1535
1664
 
1536
1665
  const int isc_high = isc % (QK_K/64);
1537
1666
  const int sc_shift_high = 2 * (isc / (QK_K/64));
1538
- const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1667
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1539
1668
 
1540
1669
  const int sc = (sc_low | sc_high) - 32;
1541
1670
 
1542
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1543
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1544
- const float d8i = bq8i->d;
1545
-
1546
1671
  const int vil = (vl >> (2*i)) & 0x03030303;
1547
1672
 
1548
1673
  const int vih = ((vh >> i) << 2) & 0x04040404;
1549
1674
 
1550
1675
  const int vi = __vsubss4(vil, vih);
1551
1676
 
1552
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1677
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1553
1678
  }
1554
1679
 
1555
- return d*sumf;
1680
+ return d3 * sumf;
1556
1681
  #else
1557
1682
  return 0.0f; // only to satisfy the compiler
1558
1683
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1559
1684
  }
1560
1685
 
1561
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1562
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1686
+ // contiguous u/y values
1687
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
1688
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
1689
+ const float & d3, const float & d8) {
1563
1690
 
1564
1691
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1565
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1692
+ int sumi = 0;
1566
1693
 
1567
- float sumf_d = 0.0f;
1568
- float sumf_m = 0.0f;
1694
+ #pragma unroll
1695
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
1696
+ int sumi_sc = 0;
1569
1697
 
1570
- #ifndef GGML_QKK_64
1698
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1699
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
1700
+ }
1571
1701
 
1572
- // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573
- const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1702
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
1703
+ }
1574
1704
 
1575
- const float d = bq4_K->d;
1576
- const float dmin = bq4_K->dmin;
1705
+ return d3*d8 * sumi;
1706
+ #else
1707
+ return 0.0f; // only to satisfy the compiler
1708
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1709
+ }
1577
1710
 
1578
- // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
1579
- // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
1580
- // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
1581
- // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
1711
+ #define VDR_Q4_K_Q8_1_MMVQ 2
1712
+ #define VDR_Q4_K_Q8_1_MMQ 8
1582
1713
 
1583
- const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1584
- const int v1 = q4[0];
1585
- const int v2 = q4[4];
1714
+ // contiguous v/x values
1715
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
1716
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1717
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
1586
1718
 
1587
- const uint16_t * scales = (const uint16_t *)bq4_K->scales;
1588
- uint16_t aux[2];
1589
- const int j = bq8_offset/2;
1590
- if (j < 2) {
1591
- aux[0] = scales[j+0] & 0x3f3f;
1592
- aux[1] = scales[j+2] & 0x3f3f;
1593
- } else {
1594
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1595
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1596
- }
1597
- const uint8_t * sc = (const uint8_t *)aux;
1598
- const uint8_t * m = sc + 2;
1719
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1720
+ float sumf_d = 0.0f;
1721
+ float sumf_m = 0.0f;
1599
1722
 
1723
+ #pragma unroll
1600
1724
  for (int i = 0; i < QR4_K; ++i) {
1725
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
1726
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
1601
1727
 
1602
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1603
- const float d8i = bq8i->d;
1604
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1605
- const int ui1 = q8[0];
1606
- const int ui2 = q8[4];
1607
-
1608
- const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
1609
- const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
1610
-
1611
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1612
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1728
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
1729
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
1613
1730
 
1614
- sumf_d += d8i * (dot1 * sc[i]);
1615
- sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1731
+ sumf_d += d8[i] * (dot1 * sc[i]);
1732
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1616
1733
  }
1617
1734
 
1618
- return d*sumf_d - dmin*sumf_m;
1619
-
1620
- #else
1735
+ const float2 dm4f = __half22float2(dm4);
1621
1736
 
1622
- uint16_t aux16[2];
1623
- const uint8_t * s = (const uint8_t *)aux16;
1737
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1624
1738
 
1625
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626
- aux16[0] = a[0] & 0x0f0f;
1627
- aux16[1] = (a[0] >> 4) & 0x0f0f;
1739
+ #else
1740
+ return 0.0f; // only to satisfy the compiler
1741
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1742
+ }
1628
1743
 
1629
- const float dall = bq4_K->d[0];
1630
- const float dmin = bq4_K->d[1];
1744
+ // contiguous u/y values
1745
+ // also used for q5_K
1746
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
1747
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1748
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1631
1749
 
1632
- const float d8_1 = bq8_1[0].d;
1633
- const float d8_2 = bq8_1[1].d;
1750
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1751
+ float sumf_d = 0.0f;
1752
+ float sumf_m = 0.0f;
1634
1753
 
1635
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1754
+ #pragma unroll
1755
+ for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
1756
+ int sumi_d = 0;
1639
1757
 
1640
- const int * q4 = (const int *)bq4_K->qs + iqs;
1641
- const int v1 = q4[0];
1642
- const int v2 = q4[4];
1758
+ #pragma unroll
1759
+ for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
1760
+ sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
1761
+ sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
1762
+ }
1643
1763
 
1644
- const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645
- const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646
- const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647
- const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
1764
+ const float2 ds8f = __half22float2(ds8[i0 / 4]);
1648
1765
 
1649
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
1766
+ sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
1767
+ sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
1768
+ }
1651
1769
 
1652
- return dall * sumf_d - dmin * sumf_m;
1770
+ const float2 dm4f = __half22float2(dm4);
1653
1771
 
1654
- #endif
1772
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1655
1773
 
1656
1774
  #else
1657
1775
  return 0.0f; // only to satisfy the compiler
1658
1776
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1659
1777
  }
1660
1778
 
1661
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1662
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1663
-
1664
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1665
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
1666
-
1667
- #ifndef GGML_QKK_64
1779
+ #define VDR_Q5_K_Q8_1_MMVQ 2
1780
+ #define VDR_Q5_K_Q8_1_MMQ 8
1668
1781
 
1669
- const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
1670
- const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1671
- const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
1782
+ // contiguous v/x values
1783
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
1784
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1785
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
1672
1786
 
1787
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1673
1788
  float sumf_d = 0.0f;
1674
1789
  float sumf_m = 0.0f;
1675
1790
 
1676
- const float d = bq5_K->d;
1677
- const float dmin = bq5_K->dmin;
1791
+ #pragma unroll
1792
+ for (int i = 0; i < QR5_K; ++i) {
1793
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
1794
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
1678
1795
 
1679
- const int vl1 = ql[0];
1680
- const int vl2 = ql[4];
1796
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
1797
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
1681
1798
 
1682
- const int vh1 = qh[0] >> bq8_offset;
1683
- const int vh2 = qh[4] >> bq8_offset;
1799
+ const int v0i = vl0i | vh0i;
1800
+ const int v1i = vl1i | vh1i;
1684
1801
 
1685
- const uint16_t * scales = (const uint16_t *)bq5_K->scales;
1686
- uint16_t aux[2];
1687
- const int j = bq8_offset/2;
1688
- if (j < 2) {
1689
- aux[0] = scales[j+0] & 0x3f3f;
1690
- aux[1] = scales[j+2] & 0x3f3f;
1691
- } else {
1692
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1693
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1694
- }
1695
- const uint8_t * sc = (const uint8_t *)aux;
1696
- const uint8_t * m = sc + 2;
1802
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
1803
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
1697
1804
 
1698
- for (int i = 0; i < QR5_K; ++i) {
1805
+ sumf_d += d8[i] * (dot1 * sc[i]);
1806
+ sumf_m += d8[i] * (dot2 * m[i]);
1699
1807
 
1700
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1701
- const float d8i = bq8i->d;
1702
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1703
- const int ui1 = q8[0];
1704
- const int ui2 = q8[4];
1808
+ }
1705
1809
 
1706
- const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
1707
- const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
1810
+ const float2 dm5f = __half22float2(dm5);
1708
1811
 
1709
- const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
1710
- const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
1812
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
1711
1813
 
1712
- const int vi1 = vil1 | vih1;
1713
- const int vi2 = vil2 | vih2;
1814
+ #else
1815
+ return 0.0f; // only to satisfy the compiler
1816
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1817
+ }
1714
1818
 
1715
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1716
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1819
+ #define VDR_Q6_K_Q8_1_MMVQ 1
1820
+ #define VDR_Q6_K_Q8_1_MMQ 8
1717
1821
 
1718
- sumf_d += d8i * (dot1 * sc[i]);
1719
- sumf_m += d8i * (dot2 * m[i]);
1822
+ // contiguous v/x values
1823
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
1824
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
1825
+ const float & d, const float * __restrict__ d8) {
1720
1826
 
1721
- }
1827
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1828
+ float sumf = 0.0f;
1722
1829
 
1723
- return d*sumf_d - dmin*sumf_m;
1830
+ #pragma unroll
1831
+ for (int i = 0; i < QR6_K; ++i) {
1832
+ const int sc = scales[4*i];
1724
1833
 
1725
- #else
1834
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1726
1835
 
1727
- const int8_t * s = bq5_K->scales;
1836
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
1728
1837
 
1729
- const float d = bq5_K->d;
1838
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1730
1839
 
1731
- const float d8_1 = bq8_1[0].d;
1732
- const float d8_2 = bq8_1[1].d;
1840
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1841
+ }
1733
1842
 
1734
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1843
+ return d*sumf;
1844
+ #else
1845
+ return 0.0f; // only to satisfy the compiler
1846
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1847
+ }
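vec_dot_q6_K_q8_1_impl_mmvq above recentres the packed 6-bit values with __vsubss4(vil | vih, 0x20202020), i.e. a per-byte subtraction of 32 (the intrinsic saturates, but 6-bit inputs in 0..63 never hit saturation). A host-side check of the per-byte subtraction (vsub4_ref is a sketch name, ignoring saturation):

    // Per-byte subtraction used to map 0..63 quants onto -32..31.
    #include <cstdint>
    #include <cstdio>

    static int vsub4_ref(int a, int b) {
        uint32_t r = 0;
        for (int k = 0; k < 4; ++k) {
            const int d = int(int8_t(a >> 8*k)) - int(int8_t(b >> 8*k));
            r |= uint32_t(uint8_t(d)) << 8*k;
        }
        return int(r);
    }

    int main() {
        const int vi = vsub4_ref(0x3F201000, 0x20202020);
        std::printf("%08X\n", vi);   // 1F00F0E0: bytes 0x1F, 0x00, 0xF0, 0xE0 = 31, 0, -16, -32
    }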
1738
1848
 
1739
- const int * ql = (const int *)bq5_K->qs + iqs;
1740
- const int vl1 = ql[0];
1741
- const int vl2 = ql[4];
1849
+ // contiguous u/y values
1850
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
1851
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
1852
+ const float & d6, const float * __restrict__ d8) {
1742
1853
 
1743
- const int step = 4 * iqs; // 0, 4, 8, 12
1744
- const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745
- const int in = step%8; // 0, 4, 0, 4
1746
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
1854
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1855
+ float sumf_d = 0.0f;
1747
1856
 
1748
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
1857
+ #pragma unroll
1858
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
1859
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
1752
1860
 
1753
- const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754
- + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
1861
+ #pragma unroll
1862
+ for (int i = i0; i < i0 + 2; ++i) {
1863
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
1864
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
1755
1865
 
1756
- return d * sumf_d;
1866
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
1867
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
1868
+ }
1757
1869
 
1758
- #endif
1870
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
1871
+ }
1872
+
1873
+ return d6 * sumf_d;
1759
1874
 
1760
1875
  #else
1761
1876
  return 0.0f; // only to satisfy the compiler
1762
1877
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1763
1878
  }
1764
1879
 
1765
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1766
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1880
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1881
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1882
+
1883
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1884
+
1885
+ int v[VDR_Q4_0_Q8_1_MMVQ];
1886
+ int u[2*VDR_Q4_0_Q8_1_MMVQ];
1887
+
1888
+ #pragma unroll
1889
+ for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
1890
+ v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
1891
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1892
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
1893
+ }
1894
+
1895
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1896
+ }
1897
+
1898
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1899
+
1900
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
1901
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
1902
+
1903
+ *x_ql = tile_x_qs;
1904
+ *x_dm = (half2 *) tile_x_d;
1905
+ }
1906
+
1907
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1908
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1909
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1910
+
1911
+ __builtin_assume(i_offset >= 0);
1912
+ __builtin_assume(i_offset < nwarps);
1913
+ __builtin_assume(k >= 0);
1914
+ __builtin_assume(k < WARP_SIZE);
1915
+
1916
+ const int kbx = k / QI4_0;
1917
+ const int kqsx = k % QI4_0;
1918
+
1919
+ const block_q4_0 * bx0 = (block_q4_0 *) vx;
1920
+
1921
+ float * x_dmf = (float *) x_dm;
1922
+
1923
+ #pragma unroll
1924
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1925
+ int i = i0 + i_offset;
1926
+
1927
+ if (need_check) {
1928
+ i = min(i, i_max);
1929
+ }
1930
+
1931
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1932
+
1933
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1934
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1935
+ }
1936
+
1937
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1938
+ const int kbxd = k % blocks_per_tile_x_row;
1939
+
1940
+ #pragma unroll
1941
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
1942
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1943
+
1944
+ if (need_check) {
1945
+ i = min(i, i_max);
1946
+ }
1947
+
1948
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1949
+
1950
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
1951
+ }
1952
+ }
1953
+
1954
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1955
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1956
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1957
+
1958
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1959
+ const float * x_dmf = (float *) x_dm;
1960
+
1961
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
1962
+
1963
+ #pragma unroll
1964
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1965
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
1966
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
1967
+ }
1968
+
1969
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1970
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1971
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1972
+ }
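allocate_tiles_q4_0, load_tiles_q4_0 and vec_dot_q4_0_q8_1_mul_mat above form the new mul_mat_q path for q4_0: a thread block stages mmq_y rows of quant ints plus their scales in shared memory, with one extra int per row of padding, which the i * (WARP_SIZE + 1) + k indexing suggests is there to stagger rows across shared-memory banks. A rough host-side budget of one such tile (mmq_y = 64 is only an example value; the real tile sizes are presumably chosen where the kernels are launched, further down in the file):

    // Shared-memory footprint of one q4_0 tile, mirroring allocate_tiles_q4_0.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int QI4_0     = 4;              // QK4_0 / (4 * QR4_0) = 32 / 8
        const int mmq_y     = 64;             // example tile height (rows of x per thread block)

        const size_t qs_ints = size_t(mmq_y) * WARP_SIZE + mmq_y;                   // quants, +1 int padding per row
        const size_t d_vals  = size_t(mmq_y) * (WARP_SIZE / QI4_0) + mmq_y / QI4_0; // per-block scales

        const size_t bytes = qs_ints * sizeof(int) + d_vals * sizeof(float);
        std::printf("q4_0 tile, mmq_y = %d: %zu bytes of shared memory\n", mmq_y, bytes);
    }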
1973
+
1974
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1975
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1976
+
1977
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1978
+
1979
+ int v[VDR_Q4_1_Q8_1_MMVQ];
1980
+ int u[2*VDR_Q4_1_Q8_1_MMVQ];
1981
+
1982
+ #pragma unroll
1983
+ for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
1984
+ v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
1985
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1986
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
1987
+ }
1988
+
1989
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1990
+ }
1991
+
1992
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1993
+
1994
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
1995
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
1996
+
1997
+ *x_ql = tile_x_qs;
1998
+ *x_dm = tile_x_dm;
1999
+ }
2000
+
2001
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2002
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2003
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2004
+
2005
+ __builtin_assume(i_offset >= 0);
2006
+ __builtin_assume(i_offset < nwarps);
2007
+ __builtin_assume(k >= 0);
2008
+ __builtin_assume(k < WARP_SIZE);
2009
+
2010
+ const int kbx = k / QI4_1;
2011
+ const int kqsx = k % QI4_1;
2012
+
2013
+ const block_q4_1 * bx0 = (block_q4_1 *) vx;
2014
+
2015
+ #pragma unroll
2016
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2017
+ int i = i0 + i_offset;
2018
+
2019
+ if (need_check) {
2020
+ i = min(i, i_max);
2021
+ }
2022
+
2023
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
2024
+
2025
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2026
+ }
2027
+
2028
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
2029
+ const int kbxd = k % blocks_per_tile_x_row;
2030
+
2031
+ #pragma unroll
2032
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
2033
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
2034
+
2035
+ if (need_check) {
2036
+ i = min(i, i_max);
2037
+ }
2038
+
2039
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
2040
+
2041
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
2042
+ }
2043
+ }
2044
+
2045
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
2046
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2047
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2048
+
2049
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2050
+
2051
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
2052
+
2053
+ #pragma unroll
2054
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
2055
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2056
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
2057
+ }
2058
+
2059
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
2060
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
2061
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2062
+ }
2063
+
2064
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
2065
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2066
+
2067
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
2068
+
2069
+ int vl[VDR_Q5_0_Q8_1_MMVQ];
2070
+ int vh[VDR_Q5_0_Q8_1_MMVQ];
2071
+ int u[2*VDR_Q5_0_Q8_1_MMVQ];
2072
+
2073
+ #pragma unroll
2074
+ for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
2075
+ vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
2076
+ vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
2077
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2078
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
2079
+ }
2080
+
2081
+ return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
2082
+ }
2083
+
2084
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2085
+
2086
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2087
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
2088
+
2089
+ *x_ql = tile_x_ql;
2090
+ *x_dm = (half2 *) tile_x_d;
2091
+ }
2092
+
2093
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2094
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2095
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2096
+
2097
+ __builtin_assume(i_offset >= 0);
2098
+ __builtin_assume(i_offset < nwarps);
2099
+ __builtin_assume(k >= 0);
2100
+ __builtin_assume(k < WARP_SIZE);
2101
+
2102
+ const int kbx = k / QI5_0;
2103
+ const int kqsx = k % QI5_0;
2104
+
2105
+ const block_q5_0 * bx0 = (block_q5_0 *) vx;
2106
+
2107
+ #pragma unroll
2108
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2109
+ int i = i0 + i_offset;
2110
+
2111
+ if (need_check) {
2112
+ i = min(i, i_max);
2113
+ }
2114
+
2115
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
2116
+
2117
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
2118
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
2119
+
2120
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
2121
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
2122
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
2123
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
2124
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
2125
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
2126
+
2127
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
2128
+
2129
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
2130
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
2131
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
2132
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
2133
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
2134
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
2135
+
2136
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
2137
+ }
2138
+
2139
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
2140
+ const int kbxd = k % blocks_per_tile_x_row;
2141
+ float * x_dmf = (float *) x_dm;
2142
+
2143
+ #pragma unroll
2144
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
2145
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
2146
+
2147
+ if (need_check) {
2148
+ i = min(i, i_max);
2149
+ }
2150
+
2151
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2152
+
2153
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
2154
+ }
2155
+ }
2156
+
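Note on the bit shuffling in load_tiles_q5_0 above: the four high bits taken from qh are moved to bit positions 4, 12, 20 and 28, so each byte of qs0/qs1 ends up holding one 5-bit quant before the offset of 16 is removed with __vsubss4. A scalar host-side sketch of the same per-byte reconstruction (illustration only; unpack_q5_byte is a hypothetical helper, and it assumes the relevant qh bits have already been shifted down so that bit b is the high bit of byte b):

    // Hypothetical scalar reference for one packed int of q5_0 quants (not part of ggml-cuda.cu).
    #include <cstdio>
    static int unpack_q5_byte(int ql_packed, int qh_bits, int byte_idx) {
        const int low  = (ql_packed >> (8 * byte_idx)) & 0x0F; // low nibble from qs
        const int high = (qh_bits   >>      byte_idx ) & 0x01; // matching 5th bit from qh
        return ((high << 4) | low) - 16;                        // signed 5-bit value, offset removed
    }
    int main() {
        const int ql = 0x0F020A05; // four example low nibbles
        const int qh = 0x0000000A; // high bits 0b1010 for bytes 0..3
        for (int b = 0; b < 4; ++b) {
            printf("byte %d -> %d\n", b, unpack_q5_byte(ql, qh, b));
        }
        return 0;
    }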
2157
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
2158
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2159
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2160
+
2161
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2162
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
2163
+ const float * x_dmf = (const float *) x_dm;
2164
+ const float * y_df = (const float *) y_ds;
2165
+
2166
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
2167
+
2168
+ #pragma unroll
2169
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
2170
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2171
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
2172
+ }
2173
+
2174
+ return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
2175
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2176
+ }
2177
+
2178
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
2179
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2180
+
2181
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
2182
+
2183
+ int vl[VDR_Q5_1_Q8_1_MMVQ];
2184
+ int vh[VDR_Q5_1_Q8_1_MMVQ];
2185
+ int u[2*VDR_Q5_1_Q8_1_MMVQ];
2186
+
2187
+ #pragma unroll
2188
+ for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
2189
+ vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
2190
+ vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
2191
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2192
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
2193
+ }
2194
+
2195
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
2196
+ }
2197
+
2198
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2199
+
2200
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2201
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
2202
+
2203
+ *x_ql = tile_x_ql;
2204
+ *x_dm = tile_x_dm;
2205
+ }
2206
+
2207
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2208
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2209
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2210
+
2211
+ __builtin_assume(i_offset >= 0);
2212
+ __builtin_assume(i_offset < nwarps);
2213
+ __builtin_assume(k >= 0);
2214
+ __builtin_assume(k < WARP_SIZE);
2215
+
2216
+ const int kbx = k / QI5_1;
2217
+ const int kqsx = k % QI5_1;
2218
+
2219
+ const block_q5_1 * bx0 = (block_q5_1 *) vx;
2220
+
2221
+ #pragma unroll
2222
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2223
+ int i = i0 + i_offset;
2224
+
2225
+ if (need_check) {
2226
+ i = min(i, i_max);
2227
+ }
2228
+
2229
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
2230
+
2231
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2232
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
2233
+
2234
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
2235
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
2236
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
2237
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
2238
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
2239
+
2240
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
2241
+
2242
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
2243
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
2244
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
2245
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
2246
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
2247
+
2248
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
2249
+ }
2250
+
2251
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
2252
+ const int kbxd = k % blocks_per_tile_x_row;
2253
+
2254
+ #pragma unroll
2255
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
2256
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
2257
+
2258
+ if (need_check) {
2259
+ i = min(i, i_max);
2260
+ }
2261
+
2262
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
2263
+
2264
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
2265
+ }
2266
+ }
2267
+
2268
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
2269
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2270
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2271
+
2272
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2273
+ const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
2274
+
2275
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
2276
+
2277
+ #pragma unroll
2278
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
2279
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2280
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
2281
+ }
2282
+
2283
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
2284
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2285
+ }
2286
+
2287
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
2288
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2289
+
2290
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
2291
+
2292
+ int v[VDR_Q8_0_Q8_1_MMVQ];
2293
+ int u[VDR_Q8_0_Q8_1_MMVQ];
2294
+
2295
+ #pragma unroll
2296
+ for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
2297
+ v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
2298
+ u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2299
+ }
2300
+
2301
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
2302
+ }
2303
+
2304
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2305
+
2306
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2307
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
2308
+
2309
+ *x_ql = tile_x_qs;
2310
+ *x_dm = (half2 *) tile_x_d;
2311
+ }
2312
+
2313
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2314
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2315
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2316
+
2317
+ __builtin_assume(i_offset >= 0);
2318
+ __builtin_assume(i_offset < nwarps);
2319
+ __builtin_assume(k >= 0);
2320
+ __builtin_assume(k < WARP_SIZE);
2321
+
2322
+ const int kbx = k / QI8_0;
2323
+ const int kqsx = k % QI8_0;
2324
+ float * x_dmf = (float *) x_dm;
2325
+
2326
+ const block_q8_0 * bx0 = (block_q8_0 *) vx;
2327
+
2328
+ #pragma unroll
2329
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2330
+ int i = i0 + i_offset;
2331
+
2332
+ if (need_check) {
2333
+ i = min(i, i_max);
2334
+ }
2335
+
2336
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2337
+
2338
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2339
+ }
2340
+
2341
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2342
+ const int kbxd = k % blocks_per_tile_x_row;
2343
+
2344
+ #pragma unroll
2345
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
2346
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2347
+
2348
+ if (need_check) {
2349
+ i = min(i, i_max);
2350
+ }
2351
+
2352
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2353
+
2354
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
2355
+ }
2356
+ }
2357
+
2358
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2359
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2360
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2361
+
2362
+ const float * x_dmf = (const float *) x_dm;
2363
+ const float * y_df = (const float *) y_ds;
2364
+
2365
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2366
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2367
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2368
+ }
2369
+
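For q4_0, q5_0 and q8_0 the per-block scale is a single delta, so the tile passed around as half2 * actually stores plain floats: allocate_tiles_* declares a float array, hands it out through the half2 pointer, and the dot products cast it straight back. A small host-side sketch of that round trip (illustration only; it shows just the pointer aliasing, not the kernels):

    // Host-side sketch of the float-through-half2* aliasing used for the q8_0/q5_0/q4_0 tiles
    // (not part of ggml-cuda.cu). Requires nvcc for the half2 type.
    #include <cstdio>
    #include <cuda_fp16.h>
    int main() {
        float deltas[4] = {0.5f, 1.0f, 2.0f, 4.0f};   // what load_tiles_q8_0 writes via x_dmf
        half2 * x_dm = (half2 *) deltas;              // how allocate_tiles_q8_0 exposes the buffer
        const float * x_dmf = (const float *) x_dm;   // how vec_dot_q8_0_q8_1_mul_mat reads it back
        printf("%g %g\n", x_dmf[0], x_dmf[3]);        // prints 0.5 4 - the bits are never converted to half
        return 0;
    }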
2370
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2371
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2372
+
2373
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
2374
+
2375
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
2376
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2377
+
2378
+ const uint8_t * scales = bq2_K->scales + scale_offset;
2379
+
2380
+ const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2381
+ int u[QR2_K];
2382
+ float d8[QR2_K];
2383
+
2384
+ #pragma unroll
2385
+ for (int i = 0; i < QR2_K; ++ i) {
2386
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2387
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2388
+ }
2389
+
2390
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
2391
+ }
2392
+
2393
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2394
+
2395
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2396
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
2397
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2398
+
2399
+ *x_ql = tile_x_ql;
2400
+ *x_dm = tile_x_dm;
2401
+ *x_sc = tile_x_sc;
2402
+ }
2403
+
2404
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2405
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2406
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2407
+
2408
+ __builtin_assume(i_offset >= 0);
2409
+ __builtin_assume(i_offset < nwarps);
2410
+ __builtin_assume(k >= 0);
2411
+ __builtin_assume(k < WARP_SIZE);
2412
+
2413
+ const int kbx = k / QI2_K;
2414
+ const int kqsx = k % QI2_K;
2415
+
2416
+ const block_q2_K * bx0 = (block_q2_K *) vx;
2417
+
2418
+ #pragma unroll
2419
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2420
+ int i = i0 + i_offset;
2421
+
2422
+ if (need_check) {
2423
+ i = min(i, i_max);
2424
+ }
2425
+
2426
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
2427
+
2428
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2429
+ }
2430
+
2431
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
2432
+ const int kbxd = k % blocks_per_tile_x_row;
2433
+
2434
+ #pragma unroll
2435
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
2436
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
2437
+
2438
+ if (need_check) {
2439
+ i = min(i, i_max);
2440
+ }
2441
+
2442
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
2443
+
2444
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
2445
+ }
2446
+
2447
+ #pragma unroll
2448
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2449
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2450
+
2451
+ if (need_check) {
2452
+ i = min(i, i_max);
2453
+ }
2454
+
2455
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
2456
+
2457
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
2458
+ }
2459
+ }
2460
+
2461
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2462
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2463
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2464
+
2465
+ const int kbx = k / QI2_K;
2466
+ const int ky = (k % QI2_K) * QR2_K;
2467
+ const float * y_df = (const float *) y_ds;
2468
+
2469
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
2470
+
2471
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
2472
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
2473
+
2474
+ #pragma unroll
2475
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
2476
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
2477
+ }
2478
+
2479
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
2480
+
2481
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
2482
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
2483
+ }
2484
+
2485
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2486
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2487
+
2488
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
2489
+
2490
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
2491
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2492
+
2493
+ const float d = bq3_K->d;
2494
+
2495
+ const int vl = get_int_from_uint8(bq3_K->qs, iqs);
2496
+
2497
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2498
+ const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2499
+
2500
+ int u[QR3_K];
2501
+ float d8[QR3_K];
2502
+
2503
+ #pragma unroll
2504
+ for (int i = 0; i < QR3_K; ++i) {
2505
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2506
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2507
+ }
2508
+
2509
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2510
+ }
2511
+
2512
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2513
+
2514
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2515
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2516
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2517
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2518
+
2519
+ *x_ql = tile_x_ql;
2520
+ *x_dm = tile_x_dm;
2521
+ *x_qh = tile_x_qh;
2522
+ *x_sc = tile_x_sc;
2523
+ }
2524
+
2525
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2526
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2527
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2528
+
2529
+ __builtin_assume(i_offset >= 0);
2530
+ __builtin_assume(i_offset < nwarps);
2531
+ __builtin_assume(k >= 0);
2532
+ __builtin_assume(k < WARP_SIZE);
2533
+
2534
+ const int kbx = k / QI3_K;
2535
+ const int kqsx = k % QI3_K;
2536
+
2537
+ const block_q3_K * bx0 = (block_q3_K *) vx;
2538
+
2539
+ #pragma unroll
2540
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2541
+ int i = i0 + i_offset;
2542
+
2543
+ if (need_check) {
2544
+ i = min(i, i_max);
2545
+ }
2546
+
2547
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
2548
+
2549
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
2550
+ }
2551
+
2552
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2553
+ const int kbxd = k % blocks_per_tile_x_row;
2554
+ float * x_dmf = (float *) x_dm;
2555
+
2556
+ #pragma unroll
2557
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2558
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2559
+
2560
+ if (need_check) {
2561
+ i = min(i, i_max);
2562
+ }
2563
+
2564
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2565
+
2566
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2567
+ }
2568
+
2569
+ #pragma unroll
2570
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2571
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2572
+
2573
+ if (need_check) {
2574
+ i = min(i, i_max);
2575
+ }
2576
+
2577
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2578
+
2579
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2580
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2581
+ }
2582
+
2583
+ #pragma unroll
2584
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2585
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2586
+
2587
+ if (need_check) {
2588
+ i = min(i, i_max);
2589
+ }
2590
+
2591
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2592
+
2593
+ const int ksc = k % (QI3_K/4);
2594
+
2595
+ const int ksc_low = ksc % (QI3_K/8);
2596
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2597
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2598
+
2599
+ const int ksc_high = QI3_K/8;
2600
+ const int shift_high = 2 * ksc;
2601
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2602
+
2603
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2604
+
2605
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2606
+ }
2607
+ }
2608
+
2609
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2610
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2611
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2612
+
2613
+ const int kbx = k / QI3_K;
2614
+ const int ky = (k % QI3_K) * QR3_K;
2615
+ const float * x_dmf = (const float *) x_dm;
2616
+ const float * y_df = (const float *) y_ds;
2617
+
2618
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2619
+
2620
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2621
+
2622
+ #pragma unroll
2623
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2624
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2625
+ const int shift = 2 * ((ky % 32) / 8);
2626
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2627
+
2628
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2629
+ const int vlh = (vh << 2) & 0x04040404;
2630
+
2631
+ v[l] = __vsubss4(vll, vlh);
2632
+ }
2633
+
2634
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2635
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2636
+ }
2637
+
2638
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2639
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2640
+
2641
+ #ifndef GGML_QKK_64
2642
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2643
+
2644
+ int v[2];
2645
+ int u[2*QR4_K];
2646
+ float d8[QR4_K];
2647
+
2648
+ // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
2649
+ const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
2650
+
2651
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
2652
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
2653
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
2654
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
2655
+
2656
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2657
+ v[0] = q4[0];
2658
+ v[1] = q4[4];
2659
+
2660
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
2661
+ uint16_t aux[2];
2662
+ const int j = bq8_offset/2;
2663
+ if (j < 2) {
2664
+ aux[0] = scales[j+0] & 0x3f3f;
2665
+ aux[1] = scales[j+2] & 0x3f3f;
2666
+ } else {
2667
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2668
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2669
+ }
2670
+ const uint8_t * sc = (const uint8_t *)aux;
2671
+ const uint8_t * m = sc + 2;
2672
+
2673
+ for (int i = 0; i < QR4_K; ++i) {
2674
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2675
+ d8[i] = bq8i->ds.x;
2676
+
2677
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2678
+ u[2*i+0] = q8[0];
2679
+ u[2*i+1] = q8[4];
2680
+ }
2681
+
2682
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2683
+
2684
+ #else
2685
+
2686
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2687
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2688
+
2689
+ float sumf_d = 0.0f;
2690
+ float sumf_m = 0.0f;
2691
+
2692
+ uint16_t aux16[2];
2693
+ const uint8_t * s = (const uint8_t *)aux16;
2694
+
2695
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
2696
+ aux16[0] = a[0] & 0x0f0f;
2697
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2698
+
2699
+ const float dall = bq4_K->d[0];
2700
+ const float dmin = bq4_K->d[1];
2701
+
2702
+ const float d8_1 = bq8_1[0].ds.x;
2703
+ const float d8_2 = bq8_1[1].ds.x;
2704
+
2705
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2706
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2707
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2708
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2709
+
2710
+ const int * q4 = (const int *)bq4_K->qs + (iqs/2);
2711
+ const int v1 = q4[0];
2712
+ const int v2 = q4[4];
2713
+
2714
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
2715
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
2716
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2717
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2718
+
2719
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
2720
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2721
+
2722
+ return dall * sumf_d - dmin * sumf_m;
2723
+
2724
+ #else
2725
+ return 0.0f; // only to satisfy the compiler
2726
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2727
+
2728
+ #endif
2729
+ }
2730
+
2731
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2732
+
2733
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2734
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2735
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2736
+
2737
+ *x_ql = tile_x_ql;
2738
+ *x_dm = tile_x_dm;
2739
+ *x_sc = tile_x_sc;
2740
+ }
2741
+
2742
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2743
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2744
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2745
+
2746
+ __builtin_assume(i_offset >= 0);
2747
+ __builtin_assume(i_offset < nwarps);
2748
+ __builtin_assume(k >= 0);
2749
+ __builtin_assume(k < WARP_SIZE);
2750
+
2751
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
2752
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
2753
+
2754
+ const block_q4_K * bx0 = (block_q4_K *) vx;
2755
+
2756
+ #pragma unroll
2757
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2758
+ int i = i0 + i_offset;
2759
+
2760
+ if (need_check) {
2761
+ i = min(i, i_max);
2762
+ }
2763
+
2764
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2765
+
2766
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2767
+ }
2768
+
2769
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2770
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2771
+
2772
+ #pragma unroll
2773
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2774
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2775
+
2776
+ if (need_check) {
2777
+ i = min(i, i_max);
2778
+ }
2779
+
2780
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2781
+
2782
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2783
+ }
2784
+
2785
+ #pragma unroll
2786
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2787
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2788
+
2789
+ if (need_check) {
2790
+ i = min(i, i_max);
2791
+ }
2792
+
2793
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2794
+
2795
+ const int * scales = (int *) bxi->scales;
2796
+
2797
+ const int ksc = k % (WARP_SIZE/8);
2798
+
2799
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
2800
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2801
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2802
+
2803
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2804
+ }
2805
+ }
2806
+
2807
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2808
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2809
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2810
+
2811
+ int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
2812
+
2813
+ #pragma unroll
2814
+ for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
2815
+ v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
2816
+ v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
2817
+ }
2818
+
2819
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2820
+
2821
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2822
+ return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2823
+ }
2824
+
2825
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2826
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2827
+
2828
+ #ifndef GGML_QKK_64
2829
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2830
+
2831
+ int vl[2];
2832
+ int vh[2];
2833
+ int u[2*QR5_K];
2834
+ float d8[QR5_K];
2835
+
2836
+ const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
2837
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2838
+ const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
2839
+
2840
+ vl[0] = ql[0];
2841
+ vl[1] = ql[4];
2842
+
2843
+ vh[0] = qh[0] >> bq8_offset;
2844
+ vh[1] = qh[4] >> bq8_offset;
2845
+
2846
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
2847
+ uint16_t aux[2];
2848
+ const int j = bq8_offset/2;
2849
+ if (j < 2) {
2850
+ aux[0] = scales[j+0] & 0x3f3f;
2851
+ aux[1] = scales[j+2] & 0x3f3f;
2852
+ } else {
2853
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2854
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2855
+ }
2856
+ const uint8_t * sc = (const uint8_t *)aux;
2857
+ const uint8_t * m = sc + 2;
2858
+
2859
+ #pragma unroll
2860
+ for (int i = 0; i < QR5_K; ++i) {
2861
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2862
+ d8[i] = bq8i->ds.x;
2863
+
2864
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2865
+ u[2*i+0] = q8[0];
2866
+ u[2*i+1] = q8[4];
2867
+ }
2868
+
2869
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2870
+
2871
+ #else
2872
+
2873
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2874
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2875
+
2876
+ const int8_t * s = bq5_K->scales;
2877
+
2878
+ const float d = bq5_K->d;
2879
+
2880
+ const float d8_1 = bq8_1[0].ds.x;
2881
+ const float d8_2 = bq8_1[1].ds.x;
2882
+
2883
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2884
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2885
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2886
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2887
+
2888
+ const int * ql = (const int *)bq5_K->qs + (iqs/2);
2889
+ const int vl1 = ql[0];
2890
+ const int vl2 = ql[4];
2891
+
2892
+ const int step = 4 * (iqs/2); // 0, 4, 8, 12
2893
+ const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
2894
+ const int in = step%8; // 0, 4, 0, 4
2895
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
2896
+
2897
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
2898
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
2899
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
2900
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
2901
+
2902
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
2903
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
2904
+
2905
+ return d * sumf_d;
2906
+
2907
+ #else
2908
+ return 0.0f; // only to satisfy the compiler
2909
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2910
+
2911
+ #endif
2912
+ }
2913
+
2914
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2915
+
2916
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2917
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2918
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2919
+
2920
+ *x_ql = tile_x_ql;
2921
+ *x_dm = tile_x_dm;
2922
+ *x_sc = tile_x_sc;
2923
+ }
2924
+
2925
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2926
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2927
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2928
+
2929
+ __builtin_assume(i_offset >= 0);
2930
+ __builtin_assume(i_offset < nwarps);
2931
+ __builtin_assume(k >= 0);
2932
+ __builtin_assume(k < WARP_SIZE);
2933
+
2934
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
2935
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
2936
+
2937
+ const block_q5_K * bx0 = (block_q5_K *) vx;
2938
+
2939
+ #pragma unroll
2940
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2941
+ int i = i0 + i_offset;
2942
+
2943
+ if (need_check) {
2944
+ i = min(i, i_max);
2945
+ }
2946
+
2947
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2948
+ const int ky = QR5_K*kqsx;
2949
+
2950
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2951
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2952
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2953
+
2954
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2955
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2956
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2957
+
2958
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
2959
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
2960
+
2961
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
2962
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2963
+ }
2964
+
2965
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2966
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2967
+
2968
+ #pragma unroll
2969
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
2970
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2971
+
2972
+ if (need_check) {
2973
+ i = min(i, i_max);
2974
+ }
2975
+
2976
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
2977
+
2978
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
2979
+ }
2980
+
2981
+ #pragma unroll
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2983
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2984
+
2985
+ if (need_check) {
2986
+ i = min(i, i_max);
2987
+ }
2988
+
2989
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2990
+
2991
+ const int * scales = (int *) bxi->scales;
2992
+
2993
+ const int ksc = k % (WARP_SIZE/8);
2994
+
2995
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
2996
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2997
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2998
+
2999
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
3000
+ }
3001
+ }
3002
+
3003
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3004
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3005
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3006
+
3007
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3008
+
3009
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3010
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3011
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
3012
+ }
3013
+
3014
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
3015
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
3016
+
3017
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3018
+
3019
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
3020
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
3021
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3022
+
3023
+ const int vl = get_int_from_uint8(bq6_K->ql, iqs);
3024
+ const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
3025
+
3026
+ const int8_t * scales = bq6_K->scales + scale_offset;
3027
+
3028
+ int u[QR6_K];
3029
+ float d8[QR6_K];
3030
+
3031
+ #pragma unroll
3032
+ for (int i = 0; i < QR6_K; ++i) {
3033
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
3034
+ d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
3035
+ }
3036
+
3037
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
3038
+ }
3039
+
3040
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3041
+
3042
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3043
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3044
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
3045
+
3046
+ *x_ql = tile_x_ql;
3047
+ *x_dm = tile_x_dm;
3048
+ *x_sc = tile_x_sc;
3049
+ }
3050
+
3051
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3052
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3053
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3054
+
3055
+ __builtin_assume(i_offset >= 0);
3056
+ __builtin_assume(i_offset < nwarps);
3057
+ __builtin_assume(k >= 0);
3058
+ __builtin_assume(k < WARP_SIZE);
3059
+
3060
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
3061
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
3062
+
3063
+ const block_q6_K * bx0 = (block_q6_K *) vx;
3064
+
3065
+ #pragma unroll
3066
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
3067
+ int i = i0 + i_offset;
3068
+
3069
+ if (need_check) {
3070
+ i = min(i, i_max);
3071
+ }
3072
+
3073
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3074
+ const int ky = QR6_K*kqsx;
3075
+
3076
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3077
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3078
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
3079
+
3080
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3081
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3082
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3083
+
3084
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3085
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3086
+
3087
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3088
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3089
+ }
3090
+
3091
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3092
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3093
+ float * x_dmf = (float *) x_dm;
3094
+
3095
+ #pragma unroll
3096
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3097
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3098
+
3099
+ if (need_check) {
3100
+ i = min(i, i_max);
3101
+ }
3102
+
3103
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3104
+
3105
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3106
+ }
3107
+
3108
+ #pragma unroll
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3110
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3111
+
3112
+ if (need_check) {
3113
+ i = min(i, i_max);
3114
+ }
3115
+
3116
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
3117
+
3118
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
3119
+ }
3120
+ }
3121
+
3122
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3123
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3124
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3125
+
3126
+ const float * x_dmf = (const float *) x_dm;
3127
+ const float * y_df = (const float *) y_ds;
3128
+
3129
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3130
+
3131
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3132
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3133
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3134
+ }
3135
+
3136
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3137
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3138
+ static __global__ void mul_mat_q(
3139
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3140
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3141
+
3142
+ const block_q_t * x = (const block_q_t *) vx;
3143
+ const block_q8_1 * y = (const block_q8_1 *) vy;
3144
+
3145
+ const int blocks_per_row_x = ncols_x / qk;
3146
+ const int blocks_per_col_y = nrows_y / QK8_1;
3147
+ const int blocks_per_warp = WARP_SIZE / qi;
3148
+
3149
+ const int & ncols_dst = ncols_y;
3150
+
3151
+ const int row_dst_0 = blockIdx.x*mmq_y;
3152
+ const int & row_x_0 = row_dst_0;
3153
+ const int row_dst = row_dst_0 + threadIdx.x;
3154
+
3155
+ const int col_dst_0 = blockIdx.y*mmq_x;
3156
+ const int & col_y_0 = col_dst_0;
3157
+
3158
+ int * tile_x_ql = nullptr;
3159
+ half2 * tile_x_dm = nullptr;
3160
+ int * tile_x_qh = nullptr;
3161
+ int * tile_x_sc = nullptr;
3162
+
3163
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3164
+
3165
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3166
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
1767
3167
 
1768
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1769
- const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3168
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
1770
3169
 
1771
- const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1772
- const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1773
- const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3170
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1774
3171
 
1775
- float sumf = 0.0f;
3172
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3173
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3174
+
3175
+ #pragma unroll
3176
+ for (int ir = 0; ir < qr; ++ir) {
3177
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3178
+ const int kbxd = kqs / QI8_1;
1776
3179
 
1777
- const float d = bq6_K->d;
3180
+ #pragma unroll
3181
+ for (int i = 0; i < mmq_x; i += nwarps) {
3182
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1778
3183
 
1779
- int vl;
1780
- memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
3184
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1781
3185
 
1782
- int vh;
1783
- memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
3186
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3187
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3188
+ }
1784
3189
 
1785
- for (int i = 0; i < QR6_K; ++i) {
1786
- const int sc = bq6_K->scales[scale_offset + 4*i];
3190
+ #pragma unroll
3191
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3192
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3193
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3194
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3195
+
3196
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3197
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3198
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3199
+ if (need_sum) {
3200
+ *dsi_dst = *dsi_src;
3201
+ } else {
3202
+ float * dfi_dst = (float *) dsi_dst;
3203
+ *dfi_dst = (*dsi_src).x;
3204
+ }
3205
+ }
1787
3206
 
1788
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1789
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1790
- const float d8i = bq8i->d;
3207
+ __syncthreads();
1791
3208
 
1792
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3209
+ // #pragma unroll // unrolling this loop causes too much register pressure
3210
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3211
+ #pragma unroll
3212
+ for (int j = 0; j < mmq_x; j += nwarps) {
3213
+ #pragma unroll
3214
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3215
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3216
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3217
+ threadIdx.x + i, threadIdx.y + j, k);
3218
+ }
3219
+ }
3220
+ }
1793
3221
 
1794
- const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
3222
+ __syncthreads();
3223
+ }
3224
+ }
1795
3225
 
1796
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1797
3226
 
1798
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3227
+ if (row_dst >= nrows_dst) {
3228
+ return;
1799
3229
  }
1800
3230
 
1801
- return d*sumf;
1802
- #else
1803
- return 0.0f; // only to satisfy the compiler
1804
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3231
+ for (int j = 0; j < mmq_x; j += nwarps) {
3232
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3233
+
3234
+ if (col_dst >= ncols_dst) {
3235
+ return;
3236
+ }
3237
+
3238
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3240
+ }
3241
+ }
1805
3242
  }
1806
3243
 
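The y tiles declared in mul_mat_q above are fixed per launch: tile_y_qs holds mmq_x * WARP_SIZE packed int8 values and tile_y_ds one half2 per QI8_1 group. A rough host-side estimate of that shared-memory footprint (illustration only; the mmq_x value is assumed, not taken from the file):

    // Illustrative shared-memory budget for the y tiles of one mul_mat_q block (not part of ggml-cuda.cu).
    #include <cstdio>
    int main() {
        const int WARP_SIZE = 32, QI8_1 = 8;
        const int mmq_x = 64;                                                // assumed tile width
        const size_t y_qs_bytes = (size_t) mmq_x * WARP_SIZE * sizeof(int);  // tile_y_qs
        const size_t y_ds_bytes = (size_t) mmq_x * (WARP_SIZE / QI8_1) * 4;  // tile_y_ds, sizeof(half2) == 4
        printf("tile_y_qs: %zu bytes, tile_y_ds: %zu bytes\n", y_qs_bytes, y_ds_bytes);
        return 0;
    }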
1807
- template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
3244
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
1808
3245
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1809
3246
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1810
3247
 
@@ -1813,7 +3250,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1813
3250
  }
1814
3251
 
1815
3252
  const int blocks_per_row = ncols / qk;
1816
- const int blocks_per_warp = WARP_SIZE / qi;
3253
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
1817
3254
 
1818
3255
  // partial sum for each thread
1819
3256
  float tmp = 0.0f;
@@ -1822,11 +3259,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1822
3259
  const block_q8_1 * y = (const block_q8_1 *) vy;
1823
3260
 
1824
3261
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1825
- const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
3262
+ const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
1826
3263
 
1827
- const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
3264
+ const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
1828
3265
 
1829
- const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
3266
+ const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
1830
3267
 
1831
3268
  tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1832
3269
  }
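As a worked illustration of the vdr-based indexing above: each thread now covers vdr adjacent quant groups, so a warp advances vdr * WARP_SIZE / qi blocks per iteration instead of WARP_SIZE / qi. The sketch below is host-side and uses assumed q4_0-like parameters (qk = 32, qi = 8, QK8_1 = 32, vdr = 2); the real constants are defined earlier in the file:

    // Host-side check of the per-thread index math used in mul_mat_vec_q (not part of ggml-cuda.cu).
    #include <cstdio>
    int main() {
        const int WARP_SIZE = 32, QK8_1 = 32;
        const int qk = 32, qi = 8, vdr = 2;                    // assumed parameters
        printf("blocks per warp iteration: %d\n", vdr * WARP_SIZE / qi);
        for (int tid = 0; tid < 8; ++tid) {                    // first few lanes of the warp
            const int ibx = tid / (qi / vdr);                  // x block offset handled by this lane
            const int iby = (tid / (qi / vdr)) * (qk / QK8_1); // matching y block offset
            const int iqs = vdr * (tid % (qi / vdr));          // first quant group inside the block
            printf("lane %d: ibx+=%d iby+=%d iqs=%d\n", tid, ibx, iby, iqs);
        }
        return 0;
    }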
@@ -1859,11 +3296,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1859
3296
  const int y_offset = qr == 1 ? 1 : qk/2;
1860
3297
 
1861
3298
  // partial sum for each thread
1862
- #ifdef GGML_CUDA_DMMV_F16
3299
+ #ifdef GGML_CUDA_F16
1863
3300
  half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
1864
3301
  #else
1865
3302
  float tmp = 0.0f;
1866
- #endif // GGML_CUDA_DMMV_F16
3303
+ #endif // GGML_CUDA_F16
1867
3304
 
1868
3305
  for (int i = 0; i < ncols; i += iter_stride) {
1869
3306
  const int col = i + vals_per_iter*tid;
@@ -1883,7 +3320,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1883
3320
 
1884
3321
  // matrix multiplication
1885
3322
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
1886
- #ifdef GGML_CUDA_DMMV_F16
3323
+ #ifdef GGML_CUDA_F16
1887
3324
  tmp += __hmul2(v, {
1888
3325
  y[iybs + iqs + j/qr + 0],
1889
3326
  y[iybs + iqs + j/qr + y_offset]
@@ -1891,7 +3328,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1891
3328
  #else
1892
3329
  tmp += v.x * y[iybs + iqs + j/qr + 0];
1893
3330
  tmp += v.y * y[iybs + iqs + j/qr + y_offset];
1894
- #endif // GGML_CUDA_DMMV_F16
3331
+ #endif // GGML_CUDA_F16
1895
3332
  }
1896
3333
  }
1897
3334
 
@@ -1902,11 +3339,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1902
3339
  }
1903
3340
 
1904
3341
  if (tid == 0) {
1905
- #ifdef GGML_CUDA_DMMV_F16
3342
+ #ifdef GGML_CUDA_F16
1906
3343
  dst[row] = tmp.x + tmp.y;
1907
3344
  #else
1908
3345
  dst[row] = tmp;
1909
- #endif // GGML_CUDA_DMMV_F16
3346
+ #endif // GGML_CUDA_F16
1910
3347
  }
1911
3348
  }
1912
3349
 
@@ -2046,7 +3483,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
2046
3483
  }
2047
3484
 
2048
3485
  // rope == RoPE == rotary positional embedding
2049
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
3486
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3487
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
2050
3488
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
2051
3489
 
2052
3490
  if (col >= ncols) {
@@ -2056,7 +3494,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
2056
3494
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
2057
3495
  const int i = row*ncols + col;
2058
3496
 
2059
- const float theta = p*powf(theta_scale, col/2);
3497
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
2060
3498
  const float sin_theta = sinf(theta);
2061
3499
  const float cos_theta = cosf(theta);
2062
3500
 
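With the change above, rope_f32 no longer takes one scalar position p: every group of p_delta_rows consecutive rows shares the position p0 + p_delta * (row / p_delta_rows). A small host-side check of the formula (all values are illustrative):

    // Illustrative evaluation of the per-row theta used in rope_f32 (not part of ggml-cuda.cu).
    #include <cstdio>
    #include <cmath>
    int main() {
        const float p0 = 0.0f, p_delta = 1.0f, theta_scale = 0.5f;  // assumed values
        const int p_delta_rows = 4;                                  // rows that share one position
        const int col = 2;
        for (int row = 0; row < 8; ++row) {
            const float theta = (p0 + p_delta * (row / p_delta_rows)) * powf(theta_scale, col/2);
            printf("row=%d theta=%g\n", row, theta);                 // rows 0-3 get 0, rows 4-7 get 0.5
        }
        return 0;
    }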
@@ -2203,9 +3641,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
2203
3641
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
2204
3642
  }
2205
3643
 
2206
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
2207
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
2208
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
3644
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
3645
+ const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
3646
+ const dim3 num_blocks(block_num_x, ky, 1);
3647
+ const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
3648
+ quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
2209
3649
  }
2210
3650
 
2211
3651
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
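The quantization launch above now uses a two-dimensional grid: one grid row per y row and enough blocks to cover the padded x extent. A quick host-side sketch of the resulting grid shape (illustration only; the matrix shape and block size are assumed, not taken from the file):

    // Illustrative grid-shape computation for the 2-D quantize_q8_1 launch (not part of ggml-cuda.cu).
    #include <cstdio>
    int main() {
        const int CUDA_QUANTIZE_BLOCK_SIZE = 256;  // assumed to match the constant defined earlier in the file
        const int kx_padded = 4096, ky = 32;       // hypothetical matrix shape, already padded by the caller
        const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
        printf("grid = (%d, %d, 1)\n", block_num_x, ky);  // 16 blocks along x, one grid row per y row
        return 0;
    }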
@@ -2366,7 +3806,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
2366
3806
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2367
3807
  const dim3 block_nums(1, block_num_y, 1);
2368
3808
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2369
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
3809
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
2370
3810
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2371
3811
  }
2372
3812
 
@@ -2375,7 +3815,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
2375
3815
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2376
3816
  const dim3 block_nums(1, block_num_y, 1);
2377
3817
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2378
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
3818
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
2379
3819
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2380
3820
  }
2381
3821
 
@@ -2384,7 +3824,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
2384
3824
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2385
3825
  const dim3 block_nums(1, block_num_y, 1);
2386
3826
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2387
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
3827
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
2388
3828
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2389
3829
  }
2390
3830
 
@@ -2393,7 +3833,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
2393
3833
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2394
3834
  const dim3 block_nums(1, block_num_y, 1);
2395
3835
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2396
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
3836
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
2397
3837
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2398
3838
  }
2399
3839
 
@@ -2402,7 +3842,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
2402
3842
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2403
3843
  const dim3 block_nums(1, block_num_y, 1);
2404
3844
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2405
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
3845
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
2406
3846
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2407
3847
  }
2408
3848
 
@@ -2411,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
2411
3851
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2412
3852
  const dim3 block_nums(1, block_num_y, 1);
2413
3853
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2414
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
3854
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
2415
3855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2416
3856
  }
2417
3857
 
@@ -2420,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
2420
3860
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2421
3861
  const dim3 block_nums(1, block_num_y, 1);
2422
3862
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2423
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
3863
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
2424
3864
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2425
3865
  }
2426
3866
 
@@ -2429,10 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2429
3869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2430
3870
  const dim3 block_nums(1, block_num_y, 1);
2431
3871
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2432
- // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
2433
- // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2434
- // is better amortized.
2435
- mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
3872
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
2436
3873
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2437
3874
  }
2438
3875
 
@@ -2441,10 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
2441
3878
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2442
3879
  const dim3 block_nums(1, block_num_y, 1);
2443
3880
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2444
- // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
2445
- // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
2446
- // is better amortized.
2447
- mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
3881
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
2448
3882
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2449
3883
  }
2450
3884
 
@@ -2453,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
2453
3887
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2454
3888
  const dim3 block_nums(1, block_num_y, 1);
2455
3889
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2456
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
3890
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
2457
3891
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2458
3892
  }
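Each of the mul_mat_vec_q_*_cuda launchers above now passes an extra VDR_*_MMVQ template constant, which appears to replace the earlier QI*_K/2 trick (see the deleted comments for q4_K and q5_K above) for controlling how many quant groups each dot-product call consumes. The launch geometry itself is unchanged; a small host-side sketch of it, with GGML_CUDA_MMV_Y assumed to be its build-time default of 1:

// sketch only: the shared launch geometry of the mul_mat_vec_q_*_cuda helpers above
#include <cstdio>

int main() {
    const int WARP_SIZE       = 32;  // NVIDIA warp size
    const int GGML_CUDA_MMV_Y = 1;   // assumed default; configurable at build time
    const int nrows = 4096;          // hypothetical number of weight-matrix rows

    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    // grid (1, block_num_y, 1), block (WARP_SIZE, GGML_CUDA_MMV_Y, 1):
    // each warp reduces one dst row across all of its quant blocks
    printf("grid=(1,%d,1) block=(%d,%d,1)\n", block_num_y, WARP_SIZE, GGML_CUDA_MMV_Y);
    return 0;
}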
2459
3893
 
@@ -2500,6 +3934,537 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2500
3934
  }
2501
3935
  }
2502
3936
 
3937
+ static void ggml_mul_mat_q4_0_q8_1_cuda(
3938
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3939
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3940
+
3941
+ int id;
3942
+ CUDA_CHECK(cudaGetDevice(&id));
3943
+ const int compute_capability = g_compute_capabilities[id];
3944
+
3945
+ if (compute_capability >= CC_TURING) {
3946
+ const int mmq_x = 64;
3947
+ const int mmq_y = 128;
3948
+ const int nwarps = 4;
3949
+
3950
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3953
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
+
3955
+ if (nrows_x % mmq_y == 0) {
3956
+ const bool need_check = false;
3957
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
+ } else {
3961
+ const bool need_check = true;
3962
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
+ }
3966
+ } else {
3967
+ const int mmq_x = 64;
3968
+ const int mmq_y = 64;
3969
+ const int nwarps = 4;
3970
+
3971
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3974
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
+
3976
+ if (nrows_x % mmq_y == 0) {
3977
+ const bool need_check = false;
3978
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
+ } else {
3982
+ const bool need_check = true;
3983
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ }
3987
+ }
3988
+ }
3989
+
3990
+ static void ggml_mul_mat_q4_1_q8_1_cuda(
3991
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3992
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3993
+
3994
+ int id;
3995
+ CUDA_CHECK(cudaGetDevice(&id));
3996
+ const int compute_capability = g_compute_capabilities[id];
3997
+
3998
+ if (compute_capability >= CC_TURING) {
3999
+ const int mmq_x = 64;
4000
+ const int mmq_y = 128;
4001
+ const int nwarps = 4;
4002
+
4003
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4006
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
+
4008
+ if (nrows_x % mmq_y == 0) {
4009
+ const bool need_check = false;
4010
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
+ } else {
4014
+ const bool need_check = true;
4015
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
+ }
4019
+ } else {
4020
+ const int mmq_x = 64;
4021
+ const int mmq_y = 64;
4022
+ const int nwarps = 8;
4023
+
4024
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4027
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
+
4029
+ if (nrows_x % mmq_y == 0) {
4030
+ const bool need_check = false;
4031
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
+ } else {
4035
+ const bool need_check = true;
4036
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
+ }
4040
+
4041
+ }
4042
+ }
4043
+
4044
+ static void ggml_mul_mat_q5_0_q8_1_cuda(
4045
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4046
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4047
+
4048
+ int id;
4049
+ CUDA_CHECK(cudaGetDevice(&id));
4050
+ const int compute_capability = g_compute_capabilities[id];
4051
+
4052
+ if (compute_capability >= CC_TURING) {
4053
+ const int mmq_x = 128;
4054
+ const int mmq_y = 64;
4055
+ const int nwarps = 4;
4056
+
4057
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4060
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
+
4062
+ if (nrows_x % mmq_y == 0) {
4063
+ const bool need_check = false;
4064
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
+ } else {
4068
+ const bool need_check = true;
4069
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
+ }
4073
+ } else {
4074
+ const int mmq_x = 64;
4075
+ const int mmq_y = 64;
4076
+ const int nwarps = 8;
4077
+
4078
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4081
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
+
4083
+ if (nrows_x % mmq_y == 0) {
4084
+ const bool need_check = false;
4085
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
+ } else {
4089
+ const bool need_check = true;
4090
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
+ }
4094
+ }
4095
+ }
4096
+
4097
+ static void ggml_mul_mat_q5_1_q8_1_cuda(
4098
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4099
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4100
+
4101
+ int id;
4102
+ CUDA_CHECK(cudaGetDevice(&id));
4103
+ const int compute_capability = g_compute_capabilities[id];
4104
+
4105
+ if (compute_capability >= CC_TURING) {
4106
+ const int mmq_x = 128;
4107
+ const int mmq_y = 64;
4108
+ const int nwarps = 8;
4109
+
4110
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4113
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
+
4115
+ if (nrows_x % mmq_y == 0) {
4116
+ const bool need_check = false;
4117
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
+ } else {
4121
+ const bool need_check = true;
4122
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
+ }
4126
+ } else {
4127
+ const int mmq_x = 64;
4128
+ const int mmq_y = 64;
4129
+ const int nwarps = 8;
4130
+
4131
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4134
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
+
4136
+ if (nrows_x % mmq_y == 0) {
4137
+ const bool need_check = false;
4138
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
+ } else {
4142
+ const bool need_check = true;
4143
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
+ }
4147
+ }
4148
+ }
4149
+
4150
+ static void ggml_mul_mat_q8_0_q8_1_cuda(
4151
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4152
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4153
+
4154
+ int id;
4155
+ CUDA_CHECK(cudaGetDevice(&id));
4156
+ const int compute_capability = g_compute_capabilities[id];
4157
+
4158
+ if (compute_capability >= CC_TURING) {
4159
+ const int mmq_x = 128;
4160
+ const int mmq_y = 64;
4161
+ const int nwarps = 4;
4162
+
4163
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4166
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
+
4168
+ if (nrows_x % mmq_y == 0) {
4169
+ const bool need_check = false;
4170
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
+ } else {
4174
+ const bool need_check = true;
4175
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
+ }
4179
+ } else {
4180
+ const int mmq_x = 64;
4181
+ const int mmq_y = 64;
4182
+ const int nwarps = 8;
4183
+
4184
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4187
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
+
4189
+ if (nrows_x % mmq_y == 0) {
4190
+ const bool need_check = false;
4191
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
+ } else {
4195
+ const bool need_check = true;
4196
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
+ }
4200
+ }
4201
+ }
4202
+
4203
+ static void ggml_mul_mat_q2_K_q8_1_cuda(
4204
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4205
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4206
+
4207
+ int id;
4208
+ CUDA_CHECK(cudaGetDevice(&id));
4209
+ const int compute_capability = g_compute_capabilities[id];
4210
+
4211
+ if (compute_capability >= CC_TURING) {
4212
+ const int mmq_x = 64;
4213
+ const int mmq_y = 128;
4214
+ const int nwarps = 4;
4215
+
4216
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4217
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4218
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4219
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4220
+
4221
+ if (nrows_x % mmq_y == 0) {
4222
+ const bool need_check = false;
4223
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4224
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4225
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4226
+ } else {
4227
+ const bool need_check = true;
4228
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4229
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4230
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4231
+ }
4232
+ } else {
4233
+ const int mmq_x = 64;
4234
+ const int mmq_y = 64;
4235
+ const int nwarps = 8;
4236
+
4237
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4238
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4239
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4240
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4241
+
4242
+ if (nrows_x % mmq_y == 0) {
4243
+ const bool need_check = false;
4244
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4245
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4246
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4247
+ } else {
4248
+ const bool need_check = true;
4249
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4250
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4251
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4252
+ }
4253
+ }
4254
+ }
4255
+
4256
+ static void ggml_mul_mat_q3_K_q8_1_cuda(
4257
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4258
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4259
+
4260
+ int id;
4261
+ CUDA_CHECK(cudaGetDevice(&id));
4262
+ const int compute_capability = g_compute_capabilities[id];
4263
+
4264
+ if (compute_capability >= CC_TURING) {
4265
+ const int mmq_x = 128;
4266
+ const int mmq_y = 128;
4267
+ const int nwarps = 4;
4268
+
4269
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4270
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4271
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4272
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4273
+
4274
+ if (nrows_x % mmq_y == 0) {
4275
+ const bool need_check = false;
4276
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4277
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4278
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4279
+ } else {
4280
+ const bool need_check = true;
4281
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4282
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4283
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
+ }
4285
+ } else {
4286
+ const int mmq_x = 64;
4287
+ const int mmq_y = 64;
4288
+ const int nwarps = 8;
4289
+
4290
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4291
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4292
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4293
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4294
+
4295
+ if (nrows_x % mmq_y == 0) {
4296
+ const bool need_check = false;
4297
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4298
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4299
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4300
+ } else {
4301
+ const bool need_check = true;
4302
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4303
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4304
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4305
+ }
4306
+ }
4307
+ }
4308
+
4309
+ static void ggml_mul_mat_q4_K_q8_1_cuda(
4310
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4311
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4312
+
4313
+ int id;
4314
+ CUDA_CHECK(cudaGetDevice(&id));
4315
+ const int compute_capability = g_compute_capabilities[id];
4316
+
4317
+ if (compute_capability >= CC_TURING) {
4318
+ const int mmq_x = 64;
4319
+ const int mmq_y = 128;
4320
+ const int nwarps = 4;
4321
+
4322
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4323
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4324
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4325
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4326
+
4327
+ if (nrows_x % mmq_y == 0) {
4328
+ const bool need_check = false;
4329
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4330
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4331
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4332
+ } else {
4333
+ const bool need_check = true;
4334
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4335
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4336
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4337
+ }
4338
+ } else {
4339
+ const int mmq_x = 32;
4340
+ const int mmq_y = 64;
4341
+ const int nwarps = 8;
4342
+
4343
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4344
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4345
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4346
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4347
+
4348
+ if (nrows_x % mmq_y == 0) {
4349
+ const bool need_check = false;
4350
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4351
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4352
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4353
+ } else {
4354
+ const bool need_check = true;
4355
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4356
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4357
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4358
+ }
4359
+ }
4360
+ }
4361
+
4362
+ static void ggml_mul_mat_q5_K_q8_1_cuda(
4363
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4364
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4365
+
4366
+ int id;
4367
+ CUDA_CHECK(cudaGetDevice(&id));
4368
+ const int compute_capability = g_compute_capabilities[id];
4369
+
4370
+ if (compute_capability >= CC_TURING) {
4371
+ const int mmq_x = 64;
4372
+ const int mmq_y = 128;
4373
+ const int nwarps = 4;
4374
+
4375
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4376
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4377
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4378
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4379
+
4380
+ if (nrows_x % mmq_y == 0) {
4381
+ const bool need_check = false;
4382
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4383
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4384
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4385
+ } else {
4386
+ const bool need_check = true;
4387
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4388
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4389
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4390
+ }
4391
+ } else {
4392
+ const int mmq_x = 64;
4393
+ const int mmq_y = 64;
4394
+ const int nwarps = 8;
4395
+
4396
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4398
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4399
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4400
+
4401
+ if (nrows_x % mmq_y == 0) {
4402
+ const bool need_check = false;
4403
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4404
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4405
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4406
+ } else {
4407
+ const bool need_check = true;
4408
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4409
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4410
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4411
+ }
4412
+ }
4413
+ }
4414
+
4415
+ static void ggml_mul_mat_q6_K_q8_1_cuda(
4416
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4417
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4418
+
4419
+ int id;
4420
+ CUDA_CHECK(cudaGetDevice(&id));
4421
+ const int compute_capability = g_compute_capabilities[id];
4422
+
4423
+ if (compute_capability >= CC_TURING) {
4424
+ const int mmq_x = 64;
4425
+ const int mmq_y = 64;
4426
+ const int nwarps = 4;
4427
+
4428
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4429
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4430
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4431
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4432
+
4433
+ if (nrows_x % mmq_y == 0) {
4434
+ const bool need_check = false;
4435
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4436
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4437
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4438
+ } else {
4439
+ const bool need_check = true;
4440
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4441
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4442
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4443
+ }
4444
+ } else {
4445
+ const int mmq_x = 32;
4446
+ const int mmq_y = 64;
4447
+ const int nwarps = 8;
4448
+
4449
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4450
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4451
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4452
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4453
+
4454
+ if (nrows_x % mmq_y == 0) {
4455
+ const bool need_check = false;
4456
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4457
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4458
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4459
+ } else {
4460
+ const bool need_check = true;
4461
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4462
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4463
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4464
+ }
4465
+ }
4466
+ }
4467
+
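All of the ggml_mul_mat_q*_q8_1_cuda launchers added above follow one pattern: pick a tile shape (mmq_x, mmq_y) and warp count depending on whether the device is at least CC_TURING, then instantiate the bounds-checked kernel variant only when nrows_x is not a multiple of the tile height. A hedged sketch of that selection logic, with illustrative tile sizes (the real values vary per quantization format, as the functions above show):

// sketch only: tile selection and grid sizing shared by the mul_mat_q launchers above
#include <cstdio>

struct mmq_tile { int x, y, nwarps; };

// illustrative tile sizes; the actual values differ per quantization format
static mmq_tile pick_tile(int compute_capability) {
    const int CC_TURING = 700;
    if (compute_capability >= CC_TURING) {
        return {64, 128, 4};   // larger tiles where more shared memory/registers are available
    }
    return {64, 64, 8};        // smaller tiles for pre-Turing devices
}

int main() {
    const int nrows_x = 4097;  // hypothetical, deliberately not a multiple of the tile height
    const int ncols_y = 512;

    const mmq_tile t = pick_tile(750);
    const int block_num_x = (nrows_x + t.y - 1) / t.y;
    const int block_num_y = (ncols_y + t.x - 1) / t.x;
    const bool need_check = nrows_x % t.y != 0;  // selects the bounds-checked template variant
    printf("grid=(%d,%d) warps=%d need_check=%d\n", block_num_x, block_num_y, t.nwarps, need_check);
    return 0;
}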
2503
4468
  static void ggml_mul_mat_p021_f16_f32_cuda(
2504
4469
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
2505
4470
  const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
@@ -2544,12 +4509,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
2544
4509
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
2545
4510
  }
2546
4511
 
2547
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
4512
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4513
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
2548
4514
  GGML_ASSERT(nrows % 2 == 0);
2549
4515
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2550
4516
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
2551
4517
  const dim3 block_nums(num_blocks_x, nrows, 1);
2552
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
4518
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
2553
4519
  }
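rope_f32_cuda above now receives a starting position p0, a per-row increment p_delta and the number of rows that share one position (p_delta_rows) instead of a single precomputed p, so one launch can cover rows at different positions. The kernel body is outside this hunk, so the following is only a hedged reconstruction of the per-element angle implied by the new parameters:

// hedged reconstruction; the actual rope_f32 kernel may differ in detail
#include <cmath>
#include <cstdio>

int main() {
    const float p0           = 5.0f;   // starting position (scaled by freq_scale at the op level)
    const float p_delta      = 0.5f;   // per-position increment (freq_scale at the call site above)
    const int   p_delta_rows = 32;     // rows sharing one position (ne01 at the call site above)
    const float theta_scale  = std::pow(10000.0f, -2.0f/128.0f);  // freq_base^(-2/n_dims)

    const int row = 70, col = 6;       // hypothetical element
    const float p     = p0 + p_delta * (row / p_delta_rows);      // integer division groups rows
    const float theta = p * std::pow(theta_scale, col / 2);
    std::printf("theta = %f\n", theta);
    return 0;
}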
2554
4520
 
2555
4521
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2670,21 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
2670
4636
  }
2671
4637
 
2672
4638
 
2673
- static void * g_scratch_buffer = nullptr;
2674
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
2675
- static size_t g_scratch_offset = 0;
2676
-
2677
- static int g_device_count = -1;
2678
- static int g_main_device = 0;
2679
- #ifndef GGML_CUDA_FORCE_DMMV
2680
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2681
- #endif
2682
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
2683
-
2684
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
2685
-
2686
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
2687
-
2688
4639
  void ggml_init_cublas() {
2689
4640
  static bool initialized = false;
2690
4641
 
@@ -2701,9 +4652,7 @@ void ggml_init_cublas() {
2701
4652
  g_tensor_split[id] = total_vram;
2702
4653
  total_vram += prop.totalGlobalMem;
2703
4654
 
2704
- #ifndef GGML_CUDA_FORCE_DMMV
2705
4655
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
2706
- #endif
2707
4656
  }
2708
4657
  for (int id = 0; id < g_device_count; ++id) {
2709
4658
  g_tensor_split[id] /= total_vram;
@@ -2965,6 +4914,114 @@ inline void ggml_cuda_op_rms_norm(
2965
4914
  (void) i1;
2966
4915
  }
2967
4916
 
4917
+ inline void ggml_cuda_op_mul_mat_q(
4918
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4919
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
4920
+ cudaStream_t & cudaStream_main){
4921
+
4922
+ GGML_ASSERT(src0_ddq_i != nullptr);
4923
+ GGML_ASSERT(src1_ddf_i != nullptr);
4924
+ GGML_ASSERT(dst_ddf_i != nullptr);
4925
+
4926
+ const int64_t ne00 = src0->ne[0];
4927
+
4928
+ const int64_t ne10 = src1->ne[0];
4929
+ const int64_t ne11 = src1->ne[1];
4930
+ GGML_ASSERT(ne10 % QK8_1 == 0);
4931
+
4932
+ const int64_t ne0 = dst->ne[0];
4933
+
4934
+ const int64_t i01_diff = i01_high - i01_low;
4935
+
4936
+ int id;
4937
+ CUDA_CHECK(cudaGetDevice(&id));
4938
+
4939
+ // the main device has a larger memory buffer to hold the results from all GPUs
4940
+ // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
4941
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
4942
+
4943
+ const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
4944
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
4945
+ size_t as;
4946
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
4947
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
4948
+
4949
+ switch (src0->type) {
4950
+ case GGML_TYPE_Q4_0:
4951
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4952
+ break;
4953
+ case GGML_TYPE_Q4_1:
4954
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4955
+ break;
4956
+ case GGML_TYPE_Q5_0:
4957
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4958
+ break;
4959
+ case GGML_TYPE_Q5_1:
4960
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4961
+ break;
4962
+ case GGML_TYPE_Q8_0:
4963
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4964
+ break;
4965
+ case GGML_TYPE_Q2_K:
4966
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4967
+ break;
4968
+ case GGML_TYPE_Q3_K:
4969
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4970
+ break;
4971
+ case GGML_TYPE_Q4_K:
4972
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4973
+ break;
4974
+ case GGML_TYPE_Q5_K:
4975
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4976
+ break;
4977
+ case GGML_TYPE_Q6_K:
4978
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4979
+ break;
4980
+ default:
4981
+ GGML_ASSERT(false);
4982
+ break;
4983
+ }
4984
+
4985
+ ggml_cuda_pool_free(src1_q8_1, as);
4986
+
4987
+ (void) src1;
4988
+ (void) dst;
4989
+ (void) src0_ddf_i;
4990
+ (void) i02;
4991
+ (void) i1;
4992
+ }
4993
+
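ggml_cuda_op_mul_mat_q above first quantizes the src1 slice to q8_1 into a temporary pool buffer sized from the padded row length, then dispatches on src0->type. The size arithmetic is compact enough to miss; a small sketch with illustrative dimensions (the block size and padding constant are assumed values, for illustration only):

// sketch only: sizing of the temporary q8_1 buffer allocated above
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t QK8_1              = 32;   // values per q8_1 block
    const int64_t sizeof_block_q8_1  = 36;   // assumed block size in bytes, for illustration
    const int64_t MATRIX_ROW_PADDING = 256;  // assumed padding constant, for illustration

    const int64_t ne10 = 4128;  // hypothetical src1 row length (must be a multiple of QK8_1)
    const int64_t ne11 = 16;    // src1 rows in this slice

    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
    const int64_t bytes = padded_row_size * ne11 * sizeof_block_q8_1 / QK8_1;
    printf("padded_row_size=%lld, q8_1 buffer=%lld bytes\n",
           (long long) padded_row_size, (long long) bytes);
    return 0;
}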
4994
+ static int64_t get_row_rounding(ggml_type type) {
4995
+ int max_compute_capability = INT_MIN;
4996
+ for (int id = 0; id < g_device_count; ++id) {
4997
+ if (max_compute_capability < g_compute_capabilities[id]
4998
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
4999
+ max_compute_capability = g_compute_capabilities[id];
5000
+ }
5001
+ }
5002
+
5003
+ switch(type) {
5004
+ case GGML_TYPE_Q4_0:
5005
+ case GGML_TYPE_Q4_1:
5006
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5007
+ case GGML_TYPE_Q5_0:
5008
+ case GGML_TYPE_Q5_1:
5009
+ case GGML_TYPE_Q8_0:
5010
+ return 64;
5011
+ case GGML_TYPE_F16:
5012
+ return 1;
5013
+ case GGML_TYPE_Q2_K:
5014
+ case GGML_TYPE_Q3_K:
5015
+ case GGML_TYPE_Q4_K:
5016
+ case GGML_TYPE_Q5_K:
5017
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5018
+ case GGML_TYPE_Q6_K:
5019
+ return 64;
5020
+ default:
5021
+ GGML_ASSERT(false);
5022
+ }
5023
+ }
5024
+
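get_row_rounding above chooses the row-alignment granularity for a given quantization type from the highest compute capability among the devices that actually receive a slice; the split boundaries computed further down are then rounded down to this granularity so each device's slice starts on a tile boundary. A worked example with hypothetical devices and split fractions:

// worked example: how the row split is aligned to get_row_rounding()
// (device count, split fractions and rounding value are hypothetical)
#include <cstdio>
#include <cstdint>

int main() {
    const int     device_count = 2;
    const int64_t nrows        = 10000;
    const float   split[2]     = {0.0f, 0.6f}; // cumulative fractions, as in g_tensor_split
    const int64_t rounding     = 128;          // e.g. a k-quant on a Turing-or-newer card

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t) (nrows*split[id]);
        row_low -= row_low % rounding;

        int64_t row_high;
        if (id == device_count - 1) {
            row_high = nrows;
        } else {
            row_high = (int64_t) (nrows*split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
    return 0;
}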
2968
5025
  inline void ggml_cuda_op_mul_mat_vec(
2969
5026
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2970
5027
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2979,6 +5036,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2979
5036
 
2980
5037
  #ifdef GGML_CUDA_FORCE_DMMV
2981
5038
  const bool use_mul_mat_vec_q = false;
5039
+ (void) g_compute_capabilities[0];
2982
5040
  #else
2983
5041
  int id;
2984
5042
  CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +5064,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3006
5064
  ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
3007
5065
  size_t as;
3008
5066
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
3009
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
5067
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
3010
5068
 
3011
5069
  switch (src0->type) {
3012
5070
  case GGML_TYPE_Q4_0:
@@ -3047,7 +5105,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3047
5105
  ggml_cuda_pool_free(src1_q8_1, as);
3048
5106
  } else {
3049
5107
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
3050
- #ifdef GGML_CUDA_DMMV_F16
5108
+ #ifdef GGML_CUDA_F16
3051
5109
  size_t ash;
3052
5110
  dfloat * src1_dfloat = nullptr; // dfloat == half
3053
5111
 
@@ -3063,7 +5121,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3063
5121
  }
3064
5122
  #else
3065
5123
  dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
3066
- #endif // GGML_CUDA_DMMV_F16
5124
+ #endif // GGML_CUDA_F16
3067
5125
 
3068
5126
  switch (src0->type) {
3069
5127
  case GGML_TYPE_Q4_0:
@@ -3104,11 +5162,11 @@ inline void ggml_cuda_op_mul_mat_vec(
3104
5162
  break;
3105
5163
  }
3106
5164
 
3107
- #ifdef GGML_CUDA_DMMV_F16
5165
+ #ifdef GGML_CUDA_F16
3108
5166
  if (src1_convert_f16) {
3109
5167
  ggml_cuda_pool_free(src1_dfloat, ash);
3110
5168
  }
3111
- #endif // GGML_CUDA_DMMV_F16
5169
+ #endif // GGML_CUDA_F16
3112
5170
  }
3113
5171
 
3114
5172
  (void) src1;
@@ -3168,6 +5226,7 @@ inline void ggml_cuda_op_rope(
3168
5226
  GGML_ASSERT(dst_ddf_i != nullptr);
3169
5227
 
3170
5228
  const int64_t ne00 = src0->ne[0];
5229
+ const int64_t ne01 = src0->ne[1];
3171
5230
  const int64_t i01_diff = i01_high - i01_low;
3172
5231
 
3173
5232
  const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +5240,18 @@ inline void ggml_cuda_op_rope(
3181
5240
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
3182
5241
 
3183
5242
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
3184
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3185
5243
 
3186
- bool is_glm = mode & 4;
5244
+ const bool is_glm = mode & 4;
3187
5245
 
3188
5246
  // compute
3189
5247
  if (is_glm) {
5248
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3190
5249
  const float id_p = min(p, n_ctx - 2.f);
3191
5250
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
3192
5251
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3193
5252
  } else {
3194
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
5253
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5254
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
3195
5255
  }
3196
5256
 
3197
5257
  (void) src1;
@@ -3362,8 +5422,17 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3362
5422
 
3363
5423
  int64_t row_low, row_high;
3364
5424
  if (split) {
5425
+ const int64_t rounding = get_row_rounding(src0->type);
5426
+
3365
5427
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
3366
- row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
5428
+ row_low -= row_low % rounding;
5429
+
5430
+ if (id == g_device_count - 1) {
5431
+ row_high = nrows0;
5432
+ } else {
5433
+ row_high = nrows0*g_tensor_split[id + 1];
5434
+ row_high -= row_high % rounding;
5435
+ }
3367
5436
  } else {
3368
5437
  row_low = 0;
3369
5438
  row_high = nrows0*i02_divisor;
@@ -3529,13 +5598,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3529
5598
  if (split) {
3530
5599
  // src0 = weight matrix is saved as a transposed matrix for better memory layout.
3531
5600
  // dst is NOT transposed.
3532
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
5601
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
3533
5602
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
3534
5603
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
3535
- for (int64_t j = 0; j < ne1; ++j) {
3536
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
3537
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
3538
- }
5604
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
5605
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
5606
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
3539
5607
  } else {
3540
5608
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
3541
5609
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
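For the split case, the per-row copy loop is replaced above by a single cudaMemcpy2DAsync: the partial result is a densely packed ne1 x i01_diff block on the device, and each of its rows has to land i01_low floats into a full dst row of ne0 floats. A minimal, self-contained illustration of that strided-copy pattern (sizes are hypothetical, device-to-device for simplicity, error checks omitted):

// illustration of the strided copy used above: an (ne1 x i01_diff) block
// written into rows of a wider destination at column offset i01_low
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int ne0 = 8, i01_diff = 3, i01_low = 2, ne1 = 4;  // hypothetical sizes

    float *src = nullptr, *dst = nullptr;
    cudaMalloc(&src, ne1*i01_diff*sizeof(float));   // densely packed partial result
    cudaMalloc(&dst, ne1*ne0*sizeof(float));        // full destination rows
    cudaMemset(dst, 0, ne1*ne0*sizeof(float));

    // dst pitch = ne0 floats, src pitch = i01_diff floats; each of the ne1 rows
    // copies i01_diff floats into its dst row starting at column i01_low
    cudaMemcpy2DAsync(dst + i01_low, ne0*sizeof(float),
                      src,           i01_diff*sizeof(float),
                      i01_diff*sizeof(float), ne1,
                      cudaMemcpyDeviceToDevice, 0);
    cudaStreamSynchronize(0);

    printf("copied %d rows of %d floats each\n", ne1, i01_diff);
    cudaFree(src);
    cudaFree(dst);
    return 0;
}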
@@ -3576,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3576
5644
  if (split && g_device_count > 1) {
3577
5645
  CUDA_CHECK(cudaSetDevice(g_main_device));
3578
5646
  for (int id = 0; id < g_device_count; ++id) {
3579
- if (id != g_main_device) {
5647
+ if (id != g_main_device && src0_extra->events[id]) {
3580
5648
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
3581
5649
  }
3582
5650
  }
@@ -3718,7 +5786,19 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
3718
5786
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3719
5787
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
3720
5788
  } else {
3721
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5789
+ int min_compute_capability = INT_MAX;
5790
+ for (int id = 0; id < g_device_count; ++id) {
5791
+ if (min_compute_capability > g_compute_capabilities[id]
5792
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5793
+ min_compute_capability = g_compute_capabilities[id];
5794
+ }
5795
+ }
5796
+
5797
+ if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
5798
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
5799
+ } else {
5800
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5801
+ }
3722
5802
  }
3723
5803
  } else {
3724
5804
  GGML_ASSERT(false);
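The new dispatch above only takes the quantized mul_mat_q path when the runtime flag g_mul_mat_q is set (see ggml_cuda_set_mul_mat_q further down), src0 is a quantized type, and the lowest compute capability among the devices that receive a slice still supports dp4a-based kernels. A hedged sketch of that decision with hypothetical devices:

// sketch only: the dispatch condition; device capabilities and split are hypothetical
#include <climits>
#include <cstdio>

int main() {
    const int  MIN_CC_DP4A        = 610;   // minimum compute capability for the dp4a kernels
    const bool g_mul_mat_q        = true;  // toggled via ggml_cuda_set_mul_mat_q()
    const bool src0_is_quantized  = true;

    const int   g_device_count = 2;
    const int   caps[2]        = {860, 610};    // hypothetical compute capabilities
    const float split[2]       = {0.0f, 0.5f};  // both devices receive rows

    int min_cc = INT_MAX;
    for (int id = 0; id < g_device_count; ++id) {
        const float next = id + 1 < g_device_count ? split[id + 1] : 1.0f;
        if (min_cc > caps[id] && split[id] < next) {  // only devices that get a slice count
            min_cc = caps[id];
        }
    }

    const bool use_mul_mat_q = g_mul_mat_q && src0_is_quantized && min_cc >= MIN_CC_DP4A;
    printf("use_mul_mat_q=%d (min cc %d)\n", use_mul_mat_q, min_cc);
    return 0;
}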
@@ -3795,7 +5875,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
3795
5875
 
3796
5876
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3797
5877
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3798
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
5878
+
5879
+ const int mode = ((int32_t *) dst->op_params)[2];
5880
+ const bool is_glm = mode & 4;
5881
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
3799
5882
  }
3800
5883
 
3801
5884
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3827,8 +5910,17 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3827
5910
  row_low = 0;
3828
5911
  row_high = nrows;
3829
5912
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
5913
+ const int64_t rounding = get_row_rounding(tensor->type);
5914
+
3830
5915
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
3831
- row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
5916
+ row_low -= row_low % rounding;
5917
+
5918
+ if (id == g_device_count - 1) {
5919
+ row_high = nrows;
5920
+ } else {
5921
+ row_high = nrows*g_tensor_split[id + 1];
5922
+ row_high -= row_high % rounding;
5923
+ }
3832
5924
  } else {
3833
5925
  GGML_ASSERT(false);
3834
5926
  }
@@ -4002,6 +6094,10 @@ void ggml_cuda_set_main_device(int main_device) {
4002
6094
  }
4003
6095
  }
4004
6096
 
6097
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
6098
+ g_mul_mat_q = mul_mat_q;
6099
+ }
6100
+
4005
6101
  void ggml_cuda_set_scratch_size(size_t scratch_size) {
4006
6102
  g_scratch_size = scratch_size;
4007
6103
  }