llama_cpp 0.3.5 → 0.3.7

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -14,6 +14,7 @@
14
14
  #include "ggml.h"
15
15
 
16
16
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+ #define CC_TURING 700
17
18
 
18
19
  #if defined(_MSC_VER)
19
20
  #pragma warning(disable: 4244 4267) // possible loss of data
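
The new CC_TURING constant joins MIN_CC_DP4A as a compute-capability threshold. As a hedged sketch of how such thresholds are typically consumed (not this package's exact code), device code compares them against __CUDA_ARCH__ to pick between the __dp4a fast path and a byte-wise fallback:

```cuda
// Hedged sketch: selecting an integer dot-product path by compute capability.
// Only the values 610 (MIN_CC_DP4A) and 700 (CC_TURING) come from the diff;
// the function itself is illustrative.
static __device__ __forceinline__ int dp4a_or_fallback(const int a, const int b, int acc) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 // MIN_CC_DP4A
    return __dp4a(a, b, acc); // 4x int8 dot product in a single instruction
#else
    const signed char * va = (const signed char *) &a;
    const signed char * vb = (const signed char *) &b;
    for (int i = 0; i < 4; ++i) {
        acc += va[i]*vb[i]; // emulate __dp4a on older architectures
    }
    return acc;
#endif
}
```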
@@ -52,13 +53,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
52
53
  } while (0)
53
54
  #endif // CUDART_VERSION >= 11
54
55
 
55
- #ifdef GGML_CUDA_DMMV_F16
56
+ #ifdef GGML_CUDA_F16
56
57
  typedef half dfloat; // dequantize float
57
58
  typedef half2 dfloat2;
58
59
  #else
59
60
  typedef float dfloat; // dequantize float
60
61
  typedef float2 dfloat2;
61
- #endif //GGML_CUDA_DMMV_F16
62
+ #endif //GGML_CUDA_F16
63
+
64
+ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
65
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
66
+
67
+ int x32 = 0;
68
+ x32 |= x16[0] << 0;
69
+ x32 |= x16[1] << 16;
70
+
71
+ return x32;
72
+ }
73
+
74
+ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
75
+ const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
76
+
77
+ int x32 = 0;
78
+ x32 |= x16[0] << 0;
79
+ x32 |= x16[1] << 16;
80
+
81
+ return x32;
82
+ }
83
+
84
+ static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
85
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
86
+ }
87
+
88
+ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
89
+ return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
90
+ }
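
These four helpers all load one 32-bit group of quants. The unaligned variants assemble the int from two 16-bit loads because most quant blocks start with a single half and so only guarantee 2-byte alignment, while the "_aligned" variants are for blocks that start with a half2 and are therefore 4-byte aligned. A hedged host-side illustration (the helper name is mine) of why both paths yield the same bytes:

```cuda
// Hedged host-side illustration: either load strategy ends up with the same
// 4 bytes x8[4*i32 .. 4*i32+3] in the int.
#include <cstdint>
#include <cstdio>
#include <cstring>

static int get_int_reference(const uint8_t * x8, const int i32) {
    int x32;
    memcpy(&x32, x8 + sizeof(int) * i32, sizeof(int)); // alignment-agnostic copy
    return x32;
}

int main() {
    const uint8_t qs[8] = {0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
    printf("0x%08x\n", (unsigned) get_int_reference(qs, 1)); // 0xefcdab89 on little-endian targets
    return 0;
}
```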
62
91
 
63
92
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
64
93
  typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
@@ -87,8 +116,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
87
116
  #define QR4_1 2
88
117
  #define QI4_1 (QK4_1 / (4 * QR4_1))
89
118
  typedef struct {
90
- half d; // delta
91
- half m; // min
119
+ half2 dm; // dm.x = delta, dm.y = min
92
120
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
93
121
  } block_q4_1;
94
122
  static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
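
Merging the separate d and m halves into a single half2 keeps the block layout and size identical (the static_assert above is unchanged) while letting kernels fetch scale and min with one 32-bit load and convert them together. A minimal sketch of reading the merged field on the device:

```cuda
// Hedged sketch: unpacking the merged half2. dm.x is the delta, dm.y the min,
// matching the two separate fields it replaces.
#include <cuda_fp16.h>

static __device__ __forceinline__ void unpack_dm(const half2 dm, float & d, float & m) {
    const float2 dmf = __half22float2(dm); // one conversion for both values
    d = dmf.x;
    m = dmf.y;
}
```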
@@ -107,8 +135,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
107
135
  #define QR5_1 2
108
136
  #define QI5_1 (QK5_1 / (4 * QR5_1))
109
137
  typedef struct {
110
- half d; // delta
111
- half m; // min
138
+ half2 dm; // dm.x = delta, dm.y = min
112
139
  uint8_t qh[4]; // 5-th bit of quants
113
140
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
114
141
  } block_q5_1;
@@ -127,13 +154,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
127
154
  #define QR8_1 1
128
155
  #define QI8_1 (QK8_1 / (4 * QR8_1))
129
156
  typedef struct {
130
- half d; // delta
131
- half s; // unquantized sum
157
+ half2 ds; // ds.x = delta, ds.y = sum
132
158
  int8_t qs[QK8_0]; // quants
133
159
  } block_q8_1;
134
160
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
135
161
 
136
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
162
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
163
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
164
+ typedef void (*load_tiles_cuda_t)(
165
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
166
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
167
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
168
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
169
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
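
These three new typedefs describe the pluggable pieces of the mul_mat_q path added further down: one routine reserves the shared-memory tiles, one loads quantized x data into them, and one computes a per-element dot product against the q8_1 tile. Concrete implementations (allocate_tiles_q4_0, load_tiles_q4_0, vec_dot_q4_0_q8_1_mul_mat, and so on) are presumably plugged into a generic kernel as compile-time parameters; that kernel lies outside this excerpt. A hedged, self-contained toy of the composition pattern, with illustrative names only:

```cuda
// Hedged toy example: a generic kernel parameterized on __device__ functions
// via a non-type (function-pointer) template parameter.
#include <cstdio>

typedef float (*binary_op_t)(const float &, const float &);

static __device__ __forceinline__ float op_add(const float & a, const float & b) { return a + b; }

template <binary_op_t op>
static __global__ void apply_op(const float * x, const float * y, float * dst, const int n) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < n) {
        dst[i] = op(x[i], y[i]); // resolved at compile time, no indirect call
    }
}

int main() {
    const int n = 4;
    float *x, *y, *dst;
    cudaMallocManaged(&x, n*sizeof(float));
    cudaMallocManaged(&y, n*sizeof(float));
    cudaMallocManaged(&dst, n*sizeof(float));
    for (int i = 0; i < n; ++i) { x[i] = (float) i; y[i] = 10.0f; }
    apply_op<op_add><<<1, n>>>(x, y, dst, n);
    cudaDeviceSynchronize();
    printf("%.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]); // 10 11 12 13
    cudaFree(x); cudaFree(y); cudaFree(dst);
    return 0;
}
```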
137
170
 
138
171
  //================================= k-quants
139
172
 
@@ -150,8 +183,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
150
183
  typedef struct {
151
184
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
152
185
  uint8_t qs[QK_K/4]; // quants
153
- half d; // super-block scale for quantized scales
154
- half dmin; // super-block scale for quantized mins
186
+ half2 dm; // super-block scale for quantized scales/mins
155
187
  } block_q2_K;
156
188
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
157
189
 
@@ -180,8 +212,7 @@ typedef struct {
180
212
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
181
213
  #else
182
214
  typedef struct {
183
- half d; // super-block scale for quantized scales
184
- half dmin; // super-block scale for quantized mins
215
+ half2 dm; // super-block scale for quantized scales/mins
185
216
  uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
186
217
  uint8_t qs[QK_K/2]; // 4--bit quants
187
218
  } block_q4_K;
@@ -200,11 +231,10 @@ typedef struct {
200
231
  static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
201
232
  #else
202
233
  typedef struct {
203
- half d; // super-block scale for quantized scales
204
- half dmin; // super-block scale for quantized mins
205
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
206
- uint8_t qh[QK_K/8]; // quants, high bit
207
- uint8_t qs[QK_K/2]; // quants, low 4 bits
234
+ half2 dm; // super-block scale for quantized scales/mins
235
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
236
+ uint8_t qh[QK_K/8]; // quants, high bit
237
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
208
238
  } block_q5_K;
209
239
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
210
240
  #endif
@@ -252,6 +282,20 @@ struct ggml_tensor_extra_gpu {
252
282
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
253
283
  };
254
284
 
285
+ static int g_device_count = -1;
286
+ static int g_main_device = 0;
287
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
288
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
289
+ static bool g_mul_mat_q = false;
290
+
291
+ static void * g_scratch_buffer = nullptr;
292
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
293
+ static size_t g_scratch_offset = 0;
294
+
295
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
296
+
297
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
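
This hunk introduces file-scope state for multi-GPU bookkeeping — device count, per-device compute capabilities, tensor split, cuBLAS handles, main streams, and a scratch buffer defaulting to 1 GB — plus the g_mul_mat_q flag used to select the new quantized matrix-multiplication path. A plausible initialization sketch for the capability table, hedged because the package's actual init routine does considerably more (handle and stream creation, tensor-split accumulation):

```cuda
// Hedged sketch of how g_compute_capabilities-style data is typically filled in;
// error handling omitted, function name is illustrative.
#include <cuda_runtime.h>

static void query_compute_capabilities(int * caps, const int max_devices, int * device_count) {
    cudaGetDeviceCount(device_count);
    if (*device_count > max_devices) {
        *device_count = max_devices;
    }
    for (int id = 0; id < *device_count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        caps[id] = 100*prop.major + 10*prop.minor; // e.g. 610, 700, 750
    }
}
```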
298
+
255
299
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
256
300
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
257
301
 
@@ -367,33 +411,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
367
411
  v.x = vui & 0xF;
368
412
  v.y = vui >> 4;
369
413
 
370
- #ifdef GGML_CUDA_DMMV_F16
414
+ #ifdef GGML_CUDA_F16
371
415
  v = __hsub2(v, {8.0f, 8.0f});
372
416
  v = __hmul2(v, {d, d});
373
417
  #else
374
418
  v.x = (v.x - 8.0f) * d;
375
419
  v.y = (v.y - 8.0f) * d;
376
- #endif // GGML_CUDA_DMMV_F16
420
+ #endif // GGML_CUDA_F16
377
421
  }
378
422
 
379
423
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
380
424
  const block_q4_1 * x = (const block_q4_1 *) vx;
381
425
 
382
- const dfloat d = x[ib].d;
383
- const dfloat m = x[ib].m;
426
+ const dfloat d = x[ib].dm.x;
427
+ const dfloat m = x[ib].dm.y;
384
428
 
385
429
  const int vui = x[ib].qs[iqs];
386
430
 
387
431
  v.x = vui & 0xF;
388
432
  v.y = vui >> 4;
389
433
 
390
- #ifdef GGML_CUDA_DMMV_F16
434
+ #ifdef GGML_CUDA_F16
391
435
  v = __hmul2(v, {d, d});
392
436
  v = __hadd2(v, {m, m});
393
437
  #else
394
438
  v.x = (v.x * d) + m;
395
439
  v.y = (v.y * d) + m;
396
- #endif // GGML_CUDA_DMMV_F16
440
+ #endif // GGML_CUDA_F16
397
441
  }
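
Both branches above compute the same affine de-quantization, v = q*d + m; the GGML_CUDA_F16 path merely does it in packed half2 arithmetic (__hmul2/__hadd2) instead of two float operations. A hedged host-side reference for one packed byte (function name is mine):

```cuda
// Hedged host-side reference for the q4_1 formula: each byte packs two 4-bit
// quants; the block stores (d, m) and each value is q*d + m.
#include <cstdio>

static void dequant_q4_1_pair(const unsigned char byte, const float d, const float m, float & v0, float & v1) {
    const int q0 = byte & 0xF;   // low nibble
    const int q1 = byte >> 4;    // high nibble
    v0 = q0*d + m;
    v1 = q1*d + m;
}

int main() {
    float v0, v1;
    dequant_q4_1_pair(0x4B, 0.5f, -1.0f, v0, v1); // q0 = 11, q1 = 4
    printf("%.1f %.1f\n", v0, v1); // 4.5 1.0
    return 0;
}
```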
398
442
 
399
443
  static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -410,20 +454,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
410
454
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
411
455
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
412
456
 
413
- #ifdef GGML_CUDA_DMMV_F16
457
+ #ifdef GGML_CUDA_F16
414
458
  v = __hsub2(v, {16.0f, 16.0f});
415
459
  v = __hmul2(v, {d, d});
416
460
  #else
417
461
  v.x = (v.x - 16.0f) * d;
418
462
  v.y = (v.y - 16.0f) * d;
419
- #endif // GGML_CUDA_DMMV_F16
463
+ #endif // GGML_CUDA_F16
420
464
  }
421
465
 
422
466
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
423
467
  const block_q5_1 * x = (const block_q5_1 *) vx;
424
468
 
425
- const dfloat d = x[ib].d;
426
- const dfloat m = x[ib].m;
469
+ const dfloat d = x[ib].dm.x;
470
+ const dfloat m = x[ib].dm.y;
427
471
 
428
472
  uint32_t qh;
429
473
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +478,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
434
478
  v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
435
479
  v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
436
480
 
437
- #ifdef GGML_CUDA_DMMV_F16
481
+ #ifdef GGML_CUDA_F16
438
482
  v = __hmul2(v, {d, d});
439
483
  v = __hadd2(v, {m, m});
440
484
  #else
441
485
  v.x = (v.x * d) + m;
442
486
  v.y = (v.y * d) + m;
443
- #endif // GGML_CUDA_DMMV_F16
487
+ #endif // GGML_CUDA_F16
444
488
  }
445
489
 
446
490
  static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +495,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
451
495
  v.x = x[ib].qs[iqs + 0];
452
496
  v.y = x[ib].qs[iqs + 1];
453
497
 
454
- #ifdef GGML_CUDA_DMMV_F16
498
+ #ifdef GGML_CUDA_F16
455
499
  v = __hmul2(v, {d, d});
456
500
  #else
457
501
  v.x *= d;
458
502
  v.y *= d;
459
- #endif // GGML_CUDA_DMMV_F16
503
+ #endif // GGML_CUDA_F16
460
504
  }
461
505
 
462
506
  //================================== k-quants
@@ -475,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
475
519
  const uint8_t q = x[i].qs[32*n + l];
476
520
  float * y = yy + i*QK_K + 128*n;
477
521
 
478
- float dall = x[i].d;
479
- float dmin = x[i].dmin;
522
+ float dall = x[i].dm.x;
523
+ float dmin = x[i].dm.y;
480
524
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
481
525
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
482
526
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +530,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
486
530
  const int il = tid%16; // 0...15
487
531
  const uint8_t q = x[i].qs[il] >> (2*is);
488
532
  float * y = yy + i*QK_K + 16*is + il;
489
- float dall = x[i].d;
490
- float dmin = x[i].dmin;
533
+ float dall = x[i].dm.x;
534
+ float dmin = x[i].dm.y;
491
535
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
492
536
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
493
537
  #endif
@@ -573,8 +617,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
573
617
 
574
618
  float * y = yy + i*QK_K + 64*il + n*ir;
575
619
 
576
- const float dall = x[i].d;
577
- const float dmin = x[i].dmin;
620
+ const float dall = x[i].dm.x;
621
+ const float dmin = x[i].dm.y;
578
622
 
579
623
  const uint8_t * q = x[i].qs + 32*il + n*ir;
580
624
 
@@ -612,8 +656,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
612
656
 
613
657
  float * y = yy + i*QK_K + 64*il + 2*ir;
614
658
 
615
- const float dall = x[i].d;
616
- const float dmin = x[i].dmin;
659
+ const float dall = x[i].dm.x;
660
+ const float dmin = x[i].dm.y;
617
661
 
618
662
  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
619
663
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +769,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
725
769
  const float * y = yy + i * QK_K + y_offset;
726
770
  const uint8_t * q = x[i].qs + q_offset;
727
771
 
728
- const float dall = x[i].d;
729
- const float dmin = x[i].dmin;
772
+ const float dall = x[i].dm.x;
773
+ const float dmin = x[i].dm.y;
730
774
 
731
775
  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
732
776
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +812,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
768
812
  uaux[0] = s[0] & 0x0f0f0f0f;
769
813
  uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
770
814
 
771
- const half2 * dh = (const half2 *)&x[i].d;
772
-
773
- const float2 dall = __half22float2(dh[0]);
815
+ const float2 dall = __half22float2(x[i].dm);
774
816
 
775
817
  float sum1 = 0, sum2 = 0;
776
818
  for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +990,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
948
990
  const float * y1 = yy + i*QK_K + y_offset;
949
991
  const float * y2 = y1 + 128;
950
992
 
951
- const float dall = x[i].d;
952
- const float dmin = x[i].dmin;
993
+ const float dall = x[i].dm.x;
994
+ const float dmin = x[i].dm.y;
953
995
 
954
996
  const uint16_t * a = (const uint16_t *)x[i].scales;
955
997
  aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1123,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
1081
1123
  const float * y1 = yy + i*QK_K + y_offset;
1082
1124
  const float * y2 = y1 + 128;
1083
1125
 
1084
- const float dall = x[i].d;
1085
- const float dmin = x[i].dmin;
1126
+ const float dall = x[i].dm.x;
1127
+ const float dmin = x[i].dm.y;
1086
1128
 
1087
1129
  const uint16_t * a = (const uint16_t *)x[i].scales;
1088
1130
  aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1312,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1270
1312
  v.y = x[ib + iqs + 1];
1271
1313
  }
1272
1314
 
1273
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1274
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
1315
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1316
+ const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1275
1317
 
1276
- if (i >= k) {
1318
+ if (ix >= kx_padded) {
1277
1319
  return;
1278
1320
  }
1279
1321
 
1322
+ const int iy = blockDim.y*blockIdx.y + threadIdx.y;
1323
+
1324
+ const int i_padded = iy*kx_padded + ix;
1325
+
1280
1326
  block_q8_1 * y = (block_q8_1 *) vy;
1281
1327
 
1282
- const int ib = i / QK8_1; // block index
1283
- const int iqs = i % QK8_1; // quant index
1328
+ const int ib = i_padded / QK8_1; // block index
1329
+ const int iqs = i_padded % QK8_1; // quant index
1284
1330
 
1285
- const float xi = i < ndata ? x[i] : 0.0f;
1331
+ const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
1286
1332
  float amax = fabsf(xi);
1287
1333
  float sum = xi;
1288
1334
 
@@ -1301,8 +1347,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1301
1347
  return;
1302
1348
  }
1303
1349
 
1304
- y[ib].d = d;
1305
- y[ib].s = sum;
1350
+ y[ib].ds.x = d;
1351
+ y[ib].ds.y = sum;
1306
1352
  }
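
quantize_q8_1 is now two-dimensional: ix walks the padded row length kx_padded while a second grid dimension iy selects the row, and positions past the real row length kx are zero-filled, presumably so downstream kernels can treat every q8_1 block as complete. A hedged sketch of a matching host-side launch (the block size and helper name are assumptions):

```cuda
// Hedged sketch of a host-side launch for the 2-D quantize_q8_1 kernel above.
#define CUDA_QUANTIZE_BLOCK_SIZE 256 // assumed value, for illustration

static void quantize_row_q8_1_cuda_sketch(
        const float * x, void * vy, const int kx, const int kx_padded, const int ky, cudaStream_t stream) {
    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);            // grid.y = one block row per matrix row
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}
```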
1307
1353
 
1308
1354
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1372,1876 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
1326
1372
  y[iybs + iqs + y_offset] = v.y;
1327
1373
  }
1328
1374
 
1329
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1330
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1331
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1332
- const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1375
+ // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
1376
+ // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
1377
+
1378
+ #define VDR_Q4_0_Q8_1_MMVQ 2
1379
+ #define VDR_Q4_0_Q8_1_MMQ 4
1380
+
1381
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
1382
+ const int * v, const int * u, const float & d4, const half2 & ds8) {
1333
1383
 
1334
- int vi;
1335
- memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1336
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1337
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1384
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1385
+ int sumi = 0;
1338
1386
 
1339
- const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
1387
+ #pragma unroll
1388
+ for (int i = 0; i < vdr; ++i) {
1389
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1390
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1340
1391
 
1341
- // subtract 8 from each quantized value
1342
- const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
1343
- const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
1392
+ // SIMD dot product of quantized values
1393
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1394
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1395
+ }
1344
1396
 
1345
- // SIMD dot product of quantized values
1346
- int sumi = __dp4a(vi0, ui0, 0);
1347
- sumi = __dp4a(vi1, ui1, sumi);
1397
+ const float2 ds8f = __half22float2(ds8);
1348
1398
 
1349
- return sumi*d;
1399
+ // second part effectively subtracts 8 from each quant value
1400
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1350
1401
  #else
1351
1402
  return 0.0f; // only to satisfy the compiler
1352
1403
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1353
1404
  }
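
The comment "second part effectively subtracts 8" can be made precise. Each q4_0 quant is stored with a +8 offset, so the exact contribution of one block is

$$d_4 d_8 \sum_j (q_j - 8)\,u_j \;=\; d_4 d_8 \sum_j q_j u_j \;-\; 8\,d_4 d_8 \sum_j u_j .$$

The kernel never materializes $q_j - 8$; instead each of the threads that jointly cover a block ($QI4\_0/\mathrm{vdr}$ of them in the mul_mat_vec_q path) subtracts its proportional share of the second term, using the cached q8_1 sum $s_8 = \texttt{ds8.y} \approx d_8 \sum_j u_j$:

$$-\,\frac{\mathrm{vdr}}{QI4\_0}\cdot 8\,d_4\,s_8 \;=\; -\,d_4\cdot\frac{8\,\mathrm{vdr}}{QI4\_0}\cdot\texttt{ds8f.y},$$

which is the second term of the return value; summed over the covering threads it reproduces $-8\,d_4 d_8\sum_j u_j$.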
1354
1405
 
1355
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1356
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1357
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1358
- const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1406
+ #define VDR_Q4_1_Q8_1_MMVQ 2
1407
+ #define VDR_Q4_1_Q8_1_MMQ 4
1359
1408
 
1360
- const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
1361
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1362
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
1409
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
1410
+ const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
1363
1411
 
1364
- const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
1365
- const float m = bq4_1->m;
1366
- const float s = bq8_1->s;
1412
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1413
+ int sumi = 0;
1367
1414
 
1368
- const int vi0 = (vi >> 0) & 0x0F0F0F0F;
1369
- const int vi1 = (vi >> 4) & 0x0F0F0F0F;
1415
+ #pragma unroll
1416
+ for (int i = 0; i < vdr; ++i) {
1417
+ const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
1418
+ const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
1370
1419
 
1371
- // SIMD dot product of quantized values
1372
- int sumi = __dp4a(vi0, ui0, 0);
1373
- sumi = __dp4a(vi1, ui1, sumi);
1420
+ // SIMD dot product of quantized values
1421
+ sumi = __dp4a(vi0, u[2*i+0], sumi);
1422
+ sumi = __dp4a(vi1, u[2*i+1], sumi);
1423
+ }
1374
1424
 
1375
- return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
1425
+ #ifdef GGML_CUDA_F16
1426
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
1427
+ const float d4d8 = tmp.x;
1428
+ const float m4s8 = tmp.y;
1429
+ #else
1430
+ const float2 dm4f = __half22float2(dm4);
1431
+ const float2 ds8f = __half22float2(ds8);
1432
+ const float d4d8 = dm4f.x * ds8f.x;
1433
+ const float m4s8 = dm4f.y * ds8f.y;
1434
+ #endif // GGML_CUDA_F16
1435
+
1436
+ // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
1437
+ return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
1376
1438
  #else
1377
1439
  return 0.0f; // only to satisfy the compiler
1378
1440
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1379
1441
  }
1380
1442
 
1381
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1382
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1443
+ #define VDR_Q5_0_Q8_1_MMVQ 2
1444
+ #define VDR_Q5_0_Q8_1_MMQ 4
1445
+
1446
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
1447
+ const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
1448
+
1383
1449
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1384
- const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1450
+ int sumi = 0;
1451
+
1452
+ #pragma unroll
1453
+ for (int i = 0; i < vdr; ++i) {
1454
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1455
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1456
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1457
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1458
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1459
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1460
+
1461
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1462
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1463
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1464
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1465
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1466
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1467
+ }
1468
+
1469
+ const float2 ds8f = __half22float2(ds8);
1385
1470
 
1386
- int qs;
1387
- memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1388
- const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1389
- const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1390
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1391
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
1392
-
1393
- const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
1394
-
1395
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1396
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1397
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1398
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1399
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1400
- vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
1401
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1402
-
1403
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1404
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1405
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1406
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1407
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1408
- vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
1409
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1410
-
1411
- return sumi*d;
1471
+ // second part effectively subtracts 16 from each quant value
1472
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
1412
1473
  #else
1413
1474
  return 0.0f; // only to satisfy the compiler
1414
1475
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1415
1476
  }
1416
1477
 
1417
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1418
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1478
+ #define VDR_Q5_1_Q8_1_MMVQ 2
1479
+ #define VDR_Q5_1_Q8_1_MMQ 4
1480
+
1481
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
1482
+ const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
1483
+
1419
1484
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1420
- const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1485
+ int sumi = 0;
1486
+
1487
+ #pragma unroll
1488
+ for (int i = 0; i < vdr; ++i) {
1489
+ int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1490
+ vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
1491
+ vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
1492
+ vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
1493
+ vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
1494
+ sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
1495
+
1496
+ int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
1497
+ vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
1498
+ vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
1499
+ vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
1500
+ vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
1501
+ sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1502
+ }
1503
+
1504
+ #ifdef GGML_CUDA_F16
1505
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
1506
+ const float d5d8 = tmp.x;
1507
+ const float m5s8 = tmp.y;
1508
+ #else
1509
+ const float2 dm5f = __half22float2(dm5);
1510
+ const float2 ds8f = __half22float2(ds8);
1511
+ const float d5d8 = dm5f.x * ds8f.x;
1512
+ const float m5s8 = dm5f.y * ds8f.y;
1513
+ #endif // GGML_CUDA_F16
1514
+
1515
+ // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
1516
+ return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
1421
1517
 
1422
- const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
1423
- const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
1424
- const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
1425
- const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1426
- const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
1427
-
1428
- const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
1429
- const float m = bq5_1->m;
1430
- const float s = bq8_1->s;
1431
-
1432
- int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
1433
- vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
1434
- vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
1435
- vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
1436
- vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
1437
- int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
1438
-
1439
- int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
1440
- vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
1441
- vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
1442
- vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
1443
- vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
1444
- sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
1445
-
1446
- return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
1447
1518
  #else
1448
1519
  return 0.0f; // only to satisfy the compiler
1449
1520
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1450
1521
  }
1451
1522
 
1452
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1453
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1454
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1455
- const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1523
+ #define VDR_Q8_0_Q8_1_MMVQ 2
1524
+ #define VDR_Q8_0_Q8_1_MMQ 8
1456
1525
 
1457
- int vi;
1458
- memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1459
- const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1526
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1527
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {
1460
1528
 
1461
- const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
1529
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1530
+ int sumi = 0;
1462
1531
 
1463
- // SIMD dot product of quantized values
1464
- int sumi = __dp4a(vi, ui, 0);
1532
+ #pragma unroll
1533
+ for (int i = 0; i < vdr; ++i) {
1534
+ // SIMD dot product of quantized values
1535
+ sumi = __dp4a(v[i], u[i], sumi);
1536
+ }
1465
1537
 
1466
- return sumi*d;
1538
+ return d8_0*d8_1 * sumi;
1467
1539
  #else
1468
1540
  return 0.0f; // only to satisfy the compiler
1469
1541
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1470
1542
  }
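
The q8_0 x q8_1 case has no offset to correct, so the result is simply the product of the two block scales times the integer dot product. A hedged scalar reference (names are mine):

```cuda
// Hedged scalar reference for the q8_0 x q8_1 dot product above.
#include <cstdio>

static float vec_dot_q8_reference(const signed char * v, const signed char * u, const int n,
                                  const float d8_0, const float d8_1) {
    int sumi = 0;
    for (int i = 0; i < n; ++i) {
        sumi += v[i]*u[i]; // what __dp4a accumulates four bytes at a time
    }
    return d8_0*d8_1*sumi;
}

int main() {
    const signed char v[4] = {1, -2, 3, -4};
    const signed char u[4] = {5,  6, 7,  8};
    printf("%.2f\n", vec_dot_q8_reference(v, u, 4, 0.5f, 0.25f)); // (5-12+21-32)*0.125 = -2.25
    return 0;
}
```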
1471
1543
 
1472
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
1473
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1544
+ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
1545
+ const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
1474
1546
 
1475
1547
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1476
- const block_q2_K * bq2_K = (const block_q2_K *) vbq;
1548
+ int sumi = 0;
1477
1549
 
1478
- const int bq8_offset = QR2_K * (iqs / QI8_1);
1479
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1550
+ #pragma unroll
1551
+ for (int i = 0; i < vdr; ++i) {
1552
+ // SIMD dot product of quantized values
1553
+ sumi = __dp4a(v[i], u[i], sumi);
1554
+ }
1480
1555
 
1481
- float sumf_d = 0.0f;
1482
- float sumf_m = 0.0f;
1556
+ #ifdef GGML_CUDA_F16
1557
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
1558
+ const float d8d8 = tmp.x;
1559
+ const float m8s8 = tmp.y;
1560
+ #else
1561
+ const float2 dm8f = __half22float2(dm8);
1562
+ const float2 ds8f = __half22float2(ds8);
1563
+ const float d8d8 = dm8f.x * ds8f.x;
1564
+ const float m8s8 = dm8f.y * ds8f.y;
1565
+ #endif // GGML_CUDA_F16
1566
+
1567
+ // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1568
+ return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1569
+ #else
1570
+ return 0.0f; // only to satisfy the compiler
1571
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1572
+ }
1483
1573
 
1484
- const float d = bq2_K->d;
1485
- const float dmin = bq2_K->dmin;
1574
+ #define VDR_Q2_K_Q8_1_MMVQ 1
1575
+ #define VDR_Q2_K_Q8_1_MMQ 2
1486
1576
 
1487
- const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
1577
+ // contiguous v/x values
1578
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1579
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1580
+ const half2 & dm2, const float * __restrict__ d8) {
1488
1581
 
1489
- for (int i = 0; i < QR2_K; ++i) {
1490
- const int sc = bq2_K->scales[scale_offset + 2*i];
1582
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1583
+ float sumf_d = 0.0f;
1584
+ float sumf_m = 0.0f;
1491
1585
 
1492
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1493
- const float d8i = bq8i->d;
1586
+ #pragma unroll
1587
+ for (int i = 0; i < QR2_K; ++i) {
1588
+ const int sc = scales[2*i];
1494
1589
 
1495
1590
  const int vi = (v >> (2*i)) & 0x03030303;
1496
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1497
1591
 
1498
- sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
1499
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
1592
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
1593
+
1594
+ // fill int with 4x m
1595
+ int m = sc >> 4;
1596
+ m |= m << 8;
1597
+ m |= m << 16;
1598
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
1500
1599
  }
1501
1600
 
1502
- return d*sumf_d - dmin*sumf_m;
1601
+ const float2 dm2f = __half22float2(dm2);
1602
+
1603
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
1503
1604
  #else
1504
1605
  return 0.0f; // only to satisfy the compiler
1505
1606
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1506
1607
  }
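
Compared with the old vec_dot_q2_K_q8_1, the rewritten helper above drops one integer multiply per iteration: instead of computing __dp4a(0x01010101, ui, 0) and then scaling by the 4-bit min, it replicates the min into every byte of an int so that a single __dp4a yields m times the byte sum of u directly. The replication step in isolation:

```cuda
// Hedged illustration of the byte-replication trick used above.
#include <cstdio>

static int replicate_byte(int m) { // m in 0..15
    m |= m << 8;
    m |= m << 16;
    return m;                      // 0x0m0m0m0m
}

int main() {
    printf("0x%08x\n", replicate_byte(0x0a)); // 0x0a0a0a0a
    return 0;
}
```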
1507
1608
 
1508
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
1509
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1609
+ // contiguous u/y values
1610
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
1611
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1612
+ const half2 & dm2, const float & d8) {
1510
1613
 
1511
1614
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1512
- const block_q3_K * bq3_K = (const block_q3_K *) vbq;
1615
+ int sumi_d = 0;
1616
+ int sumi_m = 0;
1513
1617
 
1514
- const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
1515
- const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
1618
+ #pragma unroll
1619
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
1620
+ int sumi_d_sc = 0;
1516
1621
 
1517
- float sumf = 0.0f;
1622
+ const int sc = scales[i0 / (QI8_1/2)];
1518
1623
 
1519
- const float d = bq3_K->d;
1624
+ // fill int with 4x m
1625
+ int m = sc >> 4;
1626
+ m |= m << 8;
1627
+ m |= m << 16;
1628
+
1629
+ #pragma unroll
1630
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1631
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
1632
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
1633
+ }
1634
+
1635
+ sumi_d += sumi_d_sc * (sc & 0xF);
1636
+ }
1637
+
1638
+ const float2 dm2f = __half22float2(dm2);
1639
+
1640
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
1641
+ #else
1642
+ return 0.0f; // only to satisfy the compiler
1643
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1644
+ }
1520
1645
 
1521
- int vl;
1522
- memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
1646
+ #define VDR_Q3_K_Q8_1_MMVQ 1
1647
+ #define VDR_Q3_K_Q8_1_MMQ 2
1523
1648
 
1524
- int vh;
1525
- memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
1526
- vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
1527
- vh >>= bq8_offset;
1649
+ // contiguous v/x values
1650
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
1651
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1652
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
1653
+
1654
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1655
+ float sumf = 0.0f;
1528
1656
 
1657
+ #pragma unroll
1529
1658
  for (int i = 0; i < QR3_K; ++i) {
1530
1659
  const int isc = scale_offset + 2*i;
1531
1660
 
1532
1661
  const int isc_low = isc % (QK_K/32);
1533
1662
  const int sc_shift_low = 4 * (isc / (QK_K/32));
1534
- const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
1663
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
1535
1664
 
1536
1665
  const int isc_high = isc % (QK_K/64);
1537
1666
  const int sc_shift_high = 2 * (isc / (QK_K/64));
1538
- const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1667
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1539
1668
 
1540
1669
  const int sc = (sc_low | sc_high) - 32;
1541
1670
 
1542
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1543
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
1544
- const float d8i = bq8i->d;
1545
-
1546
1671
  const int vil = (vl >> (2*i)) & 0x03030303;
1547
1672
 
1548
1673
  const int vih = ((vh >> i) << 2) & 0x04040404;
1549
1674
 
1550
1675
  const int vi = __vsubss4(vil, vih);
1551
1676
 
1552
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
1677
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1553
1678
  }
1554
1679
 
1555
- return d*sumf;
1680
+ return d3 * sumf;
1556
1681
  #else
1557
1682
  return 0.0f; // only to satisfy the compiler
1558
1683
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1559
1684
  }
1560
1685
 
1561
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1562
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1686
+ // contiguous u/y values
1687
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
1688
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
1689
+ const float & d3, const float & d8) {
1563
1690
 
1564
1691
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1565
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1692
+ int sumi = 0;
1566
1693
 
1567
- float sumf_d = 0.0f;
1568
- float sumf_m = 0.0f;
1694
+ #pragma unroll
1695
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
1696
+ int sumi_sc = 0;
1569
1697
 
1570
- #ifndef GGML_QKK_64
1698
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1699
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
1700
+ }
1571
1701
 
1572
- // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
1573
- const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
1702
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
1703
+ }
1574
1704
 
1575
- const float d = bq4_K->d;
1576
- const float dmin = bq4_K->dmin;
1705
+ return d3*d8 * sumi;
1706
+ #else
1707
+ return 0.0f; // only to satisfy the compiler
1708
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1709
+ }
1577
1710
 
1578
- // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
1579
- // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
1580
- // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
1581
- // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
1711
+ #define VDR_Q4_K_Q8_1_MMVQ 2
1712
+ #define VDR_Q4_K_Q8_1_MMQ 8
1582
1713
 
1583
- const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1584
- const int v1 = q4[0];
1585
- const int v2 = q4[4];
1714
+ // contiguous v/x values
1715
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
1716
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1717
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
1586
1718
 
1587
- const uint16_t * scales = (const uint16_t *)bq4_K->scales;
1588
- uint16_t aux[2];
1589
- const int j = bq8_offset/2;
1590
- if (j < 2) {
1591
- aux[0] = scales[j+0] & 0x3f3f;
1592
- aux[1] = scales[j+2] & 0x3f3f;
1593
- } else {
1594
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1595
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1596
- }
1597
- const uint8_t * sc = (const uint8_t *)aux;
1598
- const uint8_t * m = sc + 2;
1719
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1720
+ float sumf_d = 0.0f;
1721
+ float sumf_m = 0.0f;
1599
1722
 
1723
+ #pragma unroll
1600
1724
  for (int i = 0; i < QR4_K; ++i) {
1725
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
1726
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
1601
1727
 
1602
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1603
- const float d8i = bq8i->d;
1604
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1605
- const int ui1 = q8[0];
1606
- const int ui2 = q8[4];
1607
-
1608
- const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
1609
- const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
1610
-
1611
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1612
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1728
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
1729
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
1613
1730
 
1614
- sumf_d += d8i * (dot1 * sc[i]);
1615
- sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1731
+ sumf_d += d8[i] * (dot1 * sc[i]);
1732
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1616
1733
  }
1617
1734
 
1618
- return d*sumf_d - dmin*sumf_m;
1619
-
1620
- #else
1735
+ const float2 dm4f = __half22float2(dm4);
1621
1736
 
1622
- uint16_t aux16[2];
1623
- const uint8_t * s = (const uint8_t *)aux16;
1737
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1624
1738
 
1625
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
1626
- aux16[0] = a[0] & 0x0f0f;
1627
- aux16[1] = (a[0] >> 4) & 0x0f0f;
1739
+ #else
1740
+ return 0.0f; // only to satisfy the compiler
1741
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1742
+ }
1628
1743
 
1629
- const float dall = bq4_K->d[0];
1630
- const float dmin = bq4_K->d[1];
1744
+ // contiguous u/y values
1745
+ // also used for q5_K
1746
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
1747
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1748
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1631
1749
 
1632
- const float d8_1 = bq8_1[0].d;
1633
- const float d8_2 = bq8_1[1].d;
1750
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1751
+ float sumf_d = 0.0f;
1752
+ float sumf_m = 0.0f;
1634
1753
 
1635
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1636
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1637
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1638
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1754
+ #pragma unroll
1755
+ for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
1756
+ int sumi_d = 0;
1639
1757
 
1640
- const int * q4 = (const int *)bq4_K->qs + iqs;
1641
- const int v1 = q4[0];
1642
- const int v2 = q4[4];
1758
+ #pragma unroll
1759
+ for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
1760
+ sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
1761
+ sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
1762
+ }
1643
1763
 
1644
- const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
1645
- const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
1646
- const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1647
- const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
1764
+ const float2 ds8f = __half22float2(ds8[i0 / 4]);
1648
1765
 
1649
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
1650
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
1766
+ sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
1767
+ sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
1768
+ }
1651
1769
 
1652
- return dall * sumf_d - dmin * sumf_m;
1770
+ const float2 dm4f = __half22float2(dm4);
1653
1771
 
1654
- #endif
1772
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1655
1773
 
1656
1774
  #else
1657
1775
  return 0.0f; // only to satisfy the compiler
1658
1776
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1659
1777
  }
1660
1778
 
1661
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
1662
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1663
-
1664
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1665
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
1666
-
1667
- #ifndef GGML_QKK_64
1779
+ #define VDR_Q5_K_Q8_1_MMVQ 2
1780
+ #define VDR_Q5_K_Q8_1_MMQ 8
1668
1781
 
1669
- const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
1670
- const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
1671
- const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
1782
+ // contiguous v/x values
1783
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
1784
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1785
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
1672
1786
 
1787
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1673
1788
  float sumf_d = 0.0f;
1674
1789
  float sumf_m = 0.0f;
1675
1790
 
1676
- const float d = bq5_K->d;
1677
- const float dmin = bq5_K->dmin;
1791
+ #pragma unroll
1792
+ for (int i = 0; i < QR5_K; ++i) {
1793
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
1794
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
1678
1795
 
1679
- const int vl1 = ql[0];
1680
- const int vl2 = ql[4];
1796
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
1797
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
1681
1798
 
1682
- const int vh1 = qh[0] >> bq8_offset;
1683
- const int vh2 = qh[4] >> bq8_offset;
1799
+ const int v0i = vl0i | vh0i;
1800
+ const int v1i = vl1i | vh1i;
1684
1801
 
1685
- const uint16_t * scales = (const uint16_t *)bq5_K->scales;
1686
- uint16_t aux[2];
1687
- const int j = bq8_offset/2;
1688
- if (j < 2) {
1689
- aux[0] = scales[j+0] & 0x3f3f;
1690
- aux[1] = scales[j+2] & 0x3f3f;
1691
- } else {
1692
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
1693
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
1694
- }
1695
- const uint8_t * sc = (const uint8_t *)aux;
1696
- const uint8_t * m = sc + 2;
1802
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
1803
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
1697
1804
 
1698
- for (int i = 0; i < QR5_K; ++i) {
1805
+ sumf_d += d8[i] * (dot1 * sc[i]);
1806
+ sumf_m += d8[i] * (dot2 * m[i]);
1699
1807
 
1700
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1701
- const float d8i = bq8i->d;
1702
- const int * q8 = (const int *)bq8i->qs + (iqs%4);
1703
- const int ui1 = q8[0];
1704
- const int ui2 = q8[4];
1808
+ }
1705
1809
 
1706
- const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
1707
- const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
1810
+ const float2 dm5f = __half22float2(dm5);
1708
1811
 
1709
- const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
1710
- const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
1812
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
1711
1813
 
1712
- const int vi1 = vil1 | vih1;
1713
- const int vi2 = vil2 | vih2;
1814
+ #else
1815
+ return 0.0f; // only to satisfy the compiler
1816
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1817
+ }
1714
1818
 
1715
- const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
1716
- const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
1819
+ #define VDR_Q6_K_Q8_1_MMVQ 1
1820
+ #define VDR_Q6_K_Q8_1_MMQ 8
1717
1821
 
1718
- sumf_d += d8i * (dot1 * sc[i]);
1719
- sumf_m += d8i * (dot2 * m[i]);
1822
+ // contiguous v/x values
1823
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
1824
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
1825
+ const float & d, const float * __restrict__ d8) {
1720
1826
 
1721
- }
1827
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1828
+ float sumf = 0.0f;
1722
1829
 
1723
- return d*sumf_d - dmin*sumf_m;
1830
+ #pragma unroll
1831
+ for (int i = 0; i < QR6_K; ++i) {
1832
+ const int sc = scales[4*i];
1724
1833
 
1725
- #else
1834
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1726
1835
 
1727
- const int8_t * s = bq5_K->scales;
1836
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
1728
1837
 
1729
- const float d = bq5_K->d;
1838
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1730
1839
 
1731
- const float d8_1 = bq8_1[0].d;
1732
- const float d8_2 = bq8_1[1].d;
1840
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1841
+ }
1733
1842
 
1734
- const int ui1 = *((const int *)bq8_1[0].qs + iqs);
1735
- const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
1736
- const int ui3 = *((const int *)bq8_1[1].qs + iqs);
1737
- const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
1843
+ return d*sumf;
1844
+ #else
1845
+ return 0.0f; // only to satisfy the compiler
1846
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1847
+ }
1738
1848
 
1739
- const int * ql = (const int *)bq5_K->qs + iqs;
1740
- const int vl1 = ql[0];
1741
- const int vl2 = ql[4];
1849
+ // contiguous u/y values
1850
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
1851
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
1852
+ const float & d6, const float * __restrict__ d8) {
1742
1853
 
1743
- const int step = 4 * iqs; // 0, 4, 8, 12
1744
- const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
1745
- const int in = step%8; // 0, 4, 0, 4
1746
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
1854
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1855
+ float sumf_d = 0.0f;
1747
1856
 
1748
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
1749
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
1750
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
1751
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
1857
+ #pragma unroll
1858
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
1859
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
1752
1860
 
1753
- const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
1754
- + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
1861
+ #pragma unroll
1862
+ for (int i = i0; i < i0 + 2; ++i) {
1863
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
1864
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
1755
1865
 
1756
- return d * sumf_d;
1866
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
1867
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
1868
+ }
1757
1869
 
1758
- #endif
1870
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
1871
+ }
1872
+
1873
+ return d6 * sumf_d;
1759
1874
 
1760
1875
  #else
1761
1876
  return 0.0f; // only to satisfy the compiler
1762
1877
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1763
1878
  }
1764
1879
 
1765
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
1766
- const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1880
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1881
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1882
+
1883
+ const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1884
+
1885
+ int v[VDR_Q4_0_Q8_1_MMVQ];
1886
+ int u[2*VDR_Q4_0_Q8_1_MMVQ];
1887
+
1888
+ #pragma unroll
1889
+ for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
1890
+ v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
1891
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1892
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
1893
+ }
1894
+
1895
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1896
+ }
1897
+
1898
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1899
+
1900
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
1901
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
1902
+
1903
+ *x_ql = tile_x_qs;
1904
+ *x_dm = (half2 *) tile_x_d;
1905
+ }
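
The tile declarations above reserve mmq_y (respectively mmq_y/QI4_0) extra elements beyond the mmq_y x WARP_SIZE payload; together with the WARP_SIZE + 1 row stride used in load_tiles_q4_0 below, this is presumably the usual one-element-per-row padding that keeps strided shared-memory accesses from landing on a single bank. A small helper, purely illustrative, that computes the resulting shared-memory footprint:

```cuda
// Hedged helper (mine, not in the package): bytes of shared memory the two
// q4_0 tiles occupy for a given mmq_y, including the padding elements.
#include <cstdio>

#define WARP_SIZE 32
#define QI4_0 4

static size_t tile_bytes_q4_0(const int mmq_y) {
    const size_t qs_elems = (size_t) mmq_y*WARP_SIZE + mmq_y;               // tile_x_qs
    const size_t d_elems  = (size_t) mmq_y*(WARP_SIZE/QI4_0) + mmq_y/QI4_0; // tile_x_d
    return qs_elems*sizeof(int) + d_elems*sizeof(float);
}

int main() {
    printf("%zu bytes for mmq_y = 64\n", tile_bytes_q4_0(64)); // 8448 + 2112 = 10560
    return 0;
}
```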
1906
+
1907
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1908
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1909
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1910
+
1911
+ __builtin_assume(i_offset >= 0);
1912
+ __builtin_assume(i_offset < nwarps);
1913
+ __builtin_assume(k >= 0);
1914
+ __builtin_assume(k < WARP_SIZE);
1915
+
1916
+ const int kbx = k / QI4_0;
1917
+ const int kqsx = k % QI4_0;
1918
+
1919
+ const block_q4_0 * bx0 = (block_q4_0 *) vx;
1920
+
1921
+ float * x_dmf = (float *) x_dm;
1922
+
1923
+ #pragma unroll
1924
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1925
+ int i = i0 + i_offset;
1926
+
1927
+ if (need_check) {
1928
+ i = min(i, i_max);
1929
+ }
1930
+
1931
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1932
+
1933
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1934
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1935
+ }
1936
+
1937
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1938
+ const int kbxd = k % blocks_per_tile_x_row;
1939
+
1940
+ #pragma unroll
1941
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
1942
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1943
+
1944
+ if (need_check) {
1945
+ i = min(i, i_max);
1946
+ }
1947
+
1948
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1949
+
1950
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
1951
+ }
1952
+ }
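
When need_check is set, load_tiles_q4_0 clamps the row index with min(i, i_max) rather than returning early, so edge tiles are filled with duplicates of the last valid row and every thread in the warp keeps executing the same loads; the surplus rows are simply never written back. The pattern in isolation, as a hedged sketch:

```cuda
// Hedged sketch of the need_check clamp used above: keep all lanes doing
// identical work on edge tiles instead of diverging.
template <bool need_check>
static __device__ __forceinline__ int clamp_row(int i, const int i_max) {
    if (need_check) {
        i = min(i, i_max); // out-of-range rows alias the last valid row
    }
    return i;
}
```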
1953
+
1954
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1955
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1956
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1957
+
1958
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1959
+ const float * x_dmf = (float *) x_dm;
1960
+
1961
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
1962
+
1963
+ #pragma unroll
1964
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1965
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
1966
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
1967
+ }
1968
+
1969
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1970
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1971
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1972
+ }
1973
+
1974
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1975
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1976
+
1977
+ const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1978
+
1979
+ int v[VDR_Q4_1_Q8_1_MMVQ];
1980
+ int u[2*VDR_Q4_1_Q8_1_MMVQ];
1981
+
1982
+ #pragma unroll
1983
+ for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
1984
+ v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
1985
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1986
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
1987
+ }
1988
+
1989
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1990
+ }
1991
+
1992
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1993
+
1994
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
1995
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
1996
+
1997
+ *x_ql = tile_x_qs;
1998
+ *x_dm = tile_x_dm;
1999
+ }
2000
+
2001
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2002
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2003
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2004
+
2005
+ __builtin_assume(i_offset >= 0);
2006
+ __builtin_assume(i_offset < nwarps);
2007
+ __builtin_assume(k >= 0);
2008
+ __builtin_assume(k < WARP_SIZE);
2009
+
2010
+ const int kbx = k / QI4_1;
2011
+ const int kqsx = k % QI4_1;
2012
+
2013
+ const block_q4_1 * bx0 = (block_q4_1 *) vx;
2014
+
2015
+ #pragma unroll
2016
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2017
+ int i = i0 + i_offset;
2018
+
2019
+ if (need_check) {
2020
+ i = min(i, i_max);
2021
+ }
2022
+
2023
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
2024
+
2025
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2026
+ }
2027
+
2028
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
2029
+ const int kbxd = k % blocks_per_tile_x_row;
2030
+
2031
+ #pragma unroll
2032
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
2033
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
2034
+
2035
+ if (need_check) {
2036
+ i = min(i, i_max);
2037
+ }
2038
+
2039
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
2040
+
2041
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
2042
+ }
2043
+ }
2044
+
2045
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
2046
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2047
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2048
+
2049
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2050
+
2051
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
2052
+
2053
+ #pragma unroll
2054
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
2055
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2056
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
2057
+ }
2058
+
2059
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
2060
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
2061
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2062
+ }
2063
+
2064
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
2065
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2066
+
2067
+ const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
2068
+
2069
+ int vl[VDR_Q5_0_Q8_1_MMVQ];
2070
+ int vh[VDR_Q5_0_Q8_1_MMVQ];
2071
+ int u[2*VDR_Q5_0_Q8_1_MMVQ];
2072
+
2073
+ #pragma unroll
2074
+ for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
2075
+ vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
2076
+ vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
2077
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2078
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
2079
+ }
2080
+
2081
+ return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
2082
+ }
2083
+
2084
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2085
+
2086
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2087
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
2088
+
2089
+ *x_ql = tile_x_ql;
2090
+ *x_dm = (half2 *) tile_x_d;
2091
+ }
2092
+
2093
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2094
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2095
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2096
+
2097
+ __builtin_assume(i_offset >= 0);
2098
+ __builtin_assume(i_offset < nwarps);
2099
+ __builtin_assume(k >= 0);
2100
+ __builtin_assume(k < WARP_SIZE);
2101
+
2102
+ const int kbx = k / QI5_0;
2103
+ const int kqsx = k % QI5_0;
2104
+
2105
+ const block_q5_0 * bx0 = (block_q5_0 *) vx;
2106
+
2107
+ #pragma unroll
2108
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2109
+ int i = i0 + i_offset;
2110
+
2111
+ if (need_check) {
2112
+ i = min(i, i_max);
2113
+ }
2114
+
2115
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
2116
+
2117
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
2118
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
2119
+
2120
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
2121
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
2122
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
2123
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
2124
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
2125
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
2126
+
2127
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
2128
+
2129
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
2130
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
2131
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
2132
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
2133
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
2134
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
2135
+
2136
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
2137
+ }
2138
+
2139
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
2140
+ const int kbxd = k % blocks_per_tile_x_row;
2141
+ float * x_dmf = (float *) x_dm;
2142
+
2143
+ #pragma unroll
2144
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
2145
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
2146
+
2147
+ if (need_check) {
2148
+ i = min(i, i_max);
2149
+ }
2150
+
2151
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2152
+
2153
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
2154
+ }
2155
+ }
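
The shift-and-mask cascade above moves the four 5th bits into bit positions 4, 12, 20 and 28 of the packed word, after which __vsubss4 subtracts 16 from every byte. The host-side sketch below (illustrative only, not part of the package) reproduces the qs0 path for one packed word and checks it against a straightforward per-value reference.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint8_t lows[4] = {0x3, 0xF, 0x0, 0xA}; // low nibbles of four quants
        const uint8_t hbits   = 0x5;                  // 5th bit of each quant (bit b -> quant b)

        uint32_t ql = 0;                              // one quant per byte, as read by get_int_from_uint8
        for (int b = 0; b < 4; ++b) ql |= (uint32_t) lows[b] << (8*b);

        uint32_t qs0 = ql & 0x0F0F0F0F;
        qs0 |= ((uint32_t) hbits <<  4) & 0x00000010; // bit 0 -> bit  4
        qs0 |= ((uint32_t) hbits << 11) & 0x00001000; // bit 1 -> bit 12
        qs0 |= ((uint32_t) hbits << 18) & 0x00100000; // bit 2 -> bit 20
        qs0 |= ((uint32_t) hbits << 25) & 0x10000000; // bit 3 -> bit 28

        for (int b = 0; b < 4; ++b) {                 // __vsubss4(qs0, 0x10101010), done per lane here
            const int packed = (int) ((qs0 >> (8*b)) & 0xFF) - 16;
            const int ref    = (lows[b] | (((hbits >> b) & 1) << 4)) - 16;
            printf("lane %d: packed %3d  reference %3d\n", b, packed, ref);
        }
        return 0;
    }
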
2156
+
2157
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
2158
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2159
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2160
+
2161
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2162
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
2163
+ const float * x_dmf = (const float *) x_dm;
2164
+ const float * y_df = (const float *) y_ds;
2165
+
2166
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
2167
+
2168
+ #pragma unroll
2169
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
2170
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2171
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
2172
+ }
2173
+
2174
+ return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
2175
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2176
+ }
2177
+
2178
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
2179
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2180
+
2181
+ const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
2182
+
2183
+ int vl[VDR_Q5_1_Q8_1_MMVQ];
2184
+ int vh[VDR_Q5_1_Q8_1_MMVQ];
2185
+ int u[2*VDR_Q5_1_Q8_1_MMVQ];
2186
+
2187
+ #pragma unroll
2188
+ for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
2189
+ vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
2190
+ vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
2191
+ u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2192
+ u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
2193
+ }
2194
+
2195
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
2196
+ }
2197
+
2198
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2199
+
2200
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2201
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
2202
+
2203
+ *x_ql = tile_x_ql;
2204
+ *x_dm = tile_x_dm;
2205
+ }
2206
+
2207
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2208
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2209
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2210
+
2211
+ __builtin_assume(i_offset >= 0);
2212
+ __builtin_assume(i_offset < nwarps);
2213
+ __builtin_assume(k >= 0);
2214
+ __builtin_assume(k < WARP_SIZE);
2215
+
2216
+ const int kbx = k / QI5_1;
2217
+ const int kqsx = k % QI5_1;
2218
+
2219
+ const block_q5_1 * bx0 = (block_q5_1 *) vx;
2220
+
2221
+ #pragma unroll
2222
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2223
+ int i = i0 + i_offset;
2224
+
2225
+ if (need_check) {
2226
+ i = min(i, i_max);
2227
+ }
2228
+
2229
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
2230
+
2231
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2232
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
2233
+
2234
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
2235
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
2236
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
2237
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
2238
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
2239
+
2240
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
2241
+
2242
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
2243
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
2244
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
2245
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
2246
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
2247
+
2248
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
2249
+ }
2250
+
2251
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
2252
+ const int kbxd = k % blocks_per_tile_x_row;
2253
+
2254
+ #pragma unroll
2255
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
2256
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
2257
+
2258
+ if (need_check) {
2259
+ i = min(i, i_max);
2260
+ }
2261
+
2262
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
2263
+
2264
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
2265
+ }
2266
+ }
2267
+
2268
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
2269
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2270
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2271
+
2272
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
2273
+ const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
2274
+
2275
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
2276
+
2277
+ #pragma unroll
2278
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
2279
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2280
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
2281
+ }
2282
+
2283
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
2284
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
2285
+ }
2286
+
2287
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
2288
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2289
+
2290
+ const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
2291
+
2292
+ int v[VDR_Q8_0_Q8_1_MMVQ];
2293
+ int u[VDR_Q8_0_Q8_1_MMVQ];
2294
+
2295
+ #pragma unroll
2296
+ for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
2297
+ v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
2298
+ u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
2299
+ }
2300
+
2301
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
2302
+ }
2303
+
2304
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2305
+
2306
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2307
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
2308
+
2309
+ *x_ql = tile_x_qs;
2310
+ *x_dm = (half2 *) tile_x_d;
2311
+ }
2312
+
2313
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2314
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2315
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2316
+
2317
+ __builtin_assume(i_offset >= 0);
2318
+ __builtin_assume(i_offset < nwarps);
2319
+ __builtin_assume(k >= 0);
2320
+ __builtin_assume(k < WARP_SIZE);
2321
+
2322
+ const int kbx = k / QI8_0;
2323
+ const int kqsx = k % QI8_0;
2324
+ float * x_dmf = (float *) x_dm;
2325
+
2326
+ const block_q8_0 * bx0 = (block_q8_0 *) vx;
2327
+
2328
+ #pragma unroll
2329
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2330
+ int i = i0 + i_offset;
2331
+
2332
+ if (need_check) {
2333
+ i = min(i, i_max);
2334
+ }
2335
+
2336
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2337
+
2338
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2339
+ }
2340
+
2341
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2342
+ const int kbxd = k % blocks_per_tile_x_row;
2343
+
2344
+ #pragma unroll
2345
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
2346
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2347
+
2348
+ if (need_check) {
2349
+ i = min(i, i_max);
2350
+ }
2351
+
2352
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2353
+
2354
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
2355
+ }
2356
+ }
2357
+
2358
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2359
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2360
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2361
+
2362
+ const float * x_dmf = (const float *) x_dm;
2363
+ const float * y_df = (const float *) y_ds;
2364
+
2365
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2366
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2367
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2368
+ }
2369
+
2370
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2371
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2372
+
2373
+ const block_q2_K * bq2_K = (const block_q2_K *) vbq;
2374
+
2375
+ const int bq8_offset = QR2_K * (iqs / QI8_1);
2376
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2377
+
2378
+ const uint8_t * scales = bq2_K->scales + scale_offset;
2379
+
2380
+ const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2381
+ int u[QR2_K];
2382
+ float d8[QR2_K];
2383
+
2384
+ #pragma unroll
2385
+ for (int i = 0; i < QR2_K; ++ i) {
2386
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2387
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2388
+ }
2389
+
2390
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
2391
+ }
2392
+
2393
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2394
+
2395
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2396
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
2397
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2398
+
2399
+ *x_ql = tile_x_ql;
2400
+ *x_dm = tile_x_dm;
2401
+ *x_sc = tile_x_sc;
2402
+ }
2403
+
2404
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2405
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2406
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2407
+
2408
+ __builtin_assume(i_offset >= 0);
2409
+ __builtin_assume(i_offset < nwarps);
2410
+ __builtin_assume(k >= 0);
2411
+ __builtin_assume(k < WARP_SIZE);
2412
+
2413
+ const int kbx = k / QI2_K;
2414
+ const int kqsx = k % QI2_K;
2415
+
2416
+ const block_q2_K * bx0 = (block_q2_K *) vx;
2417
+
2418
+ #pragma unroll
2419
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2420
+ int i = i0 + i_offset;
2421
+
2422
+ if (need_check) {
2423
+ i = min(i, i_max);
2424
+ }
2425
+
2426
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
2427
+
2428
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2429
+ }
2430
+
2431
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
2432
+ const int kbxd = k % blocks_per_tile_x_row;
2433
+
2434
+ #pragma unroll
2435
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
2436
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
2437
+
2438
+ if (need_check) {
2439
+ i = min(i, i_max);
2440
+ }
2441
+
2442
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
2443
+
2444
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
2445
+ }
2446
+
2447
+ #pragma unroll
2448
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2449
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2450
+
2451
+ if (need_check) {
2452
+ i = min(i, i_max);
2453
+ }
2454
+
2455
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
2456
+
2457
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
2458
+ }
2459
+ }
2460
+
2461
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2462
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2463
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2464
+
2465
+ const int kbx = k / QI2_K;
2466
+ const int ky = (k % QI2_K) * QR2_K;
2467
+ const float * y_df = (const float *) y_ds;
2468
+
2469
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
2470
+
2471
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
2472
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
2473
+
2474
+ #pragma unroll
2475
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
2476
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
2477
+ }
2478
+
2479
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
2480
+
2481
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
2482
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
2483
+ }
2484
+
2485
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2486
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2487
+
2488
+ const block_q3_K * bq3_K = (const block_q3_K *) vbq;
2489
+
2490
+ const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
2491
+ const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
2492
+
2493
+ const float d = bq3_K->d;
2494
+
2495
+ const int vl = get_int_from_uint8(bq3_K->qs, iqs);
2496
+
2497
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2498
+ const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2499
+
2500
+ int u[QR3_K];
2501
+ float d8[QR3_K];
2502
+
2503
+ #pragma unroll
2504
+ for (int i = 0; i < QR3_K; ++i) {
2505
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2506
+ d8[i] = bq8_1[bq8_offset + i].ds.x;
2507
+ }
2508
+
2509
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2510
+ }
2511
+
2512
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2513
+
2514
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2515
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2516
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2517
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2518
+
2519
+ *x_ql = tile_x_ql;
2520
+ *x_dm = tile_x_dm;
2521
+ *x_qh = tile_x_qh;
2522
+ *x_sc = tile_x_sc;
2523
+ }
2524
+
2525
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2526
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2527
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2528
+
2529
+ __builtin_assume(i_offset >= 0);
2530
+ __builtin_assume(i_offset < nwarps);
2531
+ __builtin_assume(k >= 0);
2532
+ __builtin_assume(k < WARP_SIZE);
2533
+
2534
+ const int kbx = k / QI3_K;
2535
+ const int kqsx = k % QI3_K;
2536
+
2537
+ const block_q3_K * bx0 = (block_q3_K *) vx;
2538
+
2539
+ #pragma unroll
2540
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2541
+ int i = i0 + i_offset;
2542
+
2543
+ if (need_check) {
2544
+ i = min(i, i_max);
2545
+ }
2546
+
2547
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
2548
+
2549
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
2550
+ }
2551
+
2552
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2553
+ const int kbxd = k % blocks_per_tile_x_row;
2554
+ float * x_dmf = (float *) x_dm;
2555
+
2556
+ #pragma unroll
2557
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2558
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2559
+
2560
+ if (need_check) {
2561
+ i = min(i, i_max);
2562
+ }
2563
+
2564
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2565
+
2566
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2567
+ }
2568
+
2569
+ #pragma unroll
2570
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2571
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2572
+
2573
+ if (need_check) {
2574
+ i = min(i, i_max);
2575
+ }
2576
+
2577
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2578
+
2579
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2580
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2581
+ }
2582
+
2583
+ #pragma unroll
2584
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2585
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2586
+
2587
+ if (need_check) {
2588
+ i = min(i, i_max);
2589
+ }
2590
+
2591
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2592
+
2593
+ const int ksc = k % (QI3_K/4);
2594
+
2595
+ const int ksc_low = ksc % (QI3_K/8);
2596
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2597
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2598
+
2599
+ const int ksc_high = QI3_K/8;
2600
+ const int shift_high = 2 * ksc;
2601
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2602
+
2603
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2604
+
2605
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2606
+ }
2607
+ }
2608
+
2609
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2610
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2611
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2612
+
2613
+ const int kbx = k / QI3_K;
2614
+ const int ky = (k % QI3_K) * QR3_K;
2615
+ const float * x_dmf = (const float *) x_dm;
2616
+ const float * y_df = (const float *) y_ds;
2617
+
2618
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2619
+
2620
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2621
+
2622
+ #pragma unroll
2623
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2624
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2625
+ const int shift = 2 * ((ky % 32) / 8);
2626
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2627
+
2628
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2629
+ const int vlh = (vh << 2) & 0x04040404;
2630
+
2631
+ v[l] = __vsubss4(vll, vlh);
2632
+ }
2633
+
2634
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2635
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2636
+ }
2637
+
2638
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2639
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2640
+
2641
+ #ifndef GGML_QKK_64
2642
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2643
+
2644
+ int v[2];
2645
+ int u[2*QR4_K];
2646
+ float d8[QR4_K];
2647
+
2648
+ // iqs is in 0,2..30. bq8_offset = 2*(iqs/8) -> bq8_offset = 0, 2, 4, 6
2649
+ const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
2650
+
2651
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
2652
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
2653
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
2654
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
2655
+
2656
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2657
+ v[0] = q4[0];
2658
+ v[1] = q4[4];
2659
+
2660
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
2661
+ uint16_t aux[2];
2662
+ const int j = bq8_offset/2;
2663
+ if (j < 2) {
2664
+ aux[0] = scales[j+0] & 0x3f3f;
2665
+ aux[1] = scales[j+2] & 0x3f3f;
2666
+ } else {
2667
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2668
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2669
+ }
2670
+ const uint8_t * sc = (const uint8_t *)aux;
2671
+ const uint8_t * m = sc + 2;
2672
+
2673
+ for (int i = 0; i < QR4_K; ++i) {
2674
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2675
+ d8[i] = bq8i->ds.x;
2676
+
2677
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2678
+ u[2*i+0] = q8[0];
2679
+ u[2*i+1] = q8[4];
2680
+ }
2681
+
2682
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2683
+
2684
+ #else
2685
+
2686
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2687
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
2688
+
2689
+ float sumf_d = 0.0f;
2690
+ float sumf_m = 0.0f;
2691
+
2692
+ uint16_t aux16[2];
2693
+ const uint8_t * s = (const uint8_t *)aux16;
2694
+
2695
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
2696
+ aux16[0] = a[0] & 0x0f0f;
2697
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2698
+
2699
+ const float dall = bq4_K->d[0];
2700
+ const float dmin = bq4_K->d[1];
2701
+
2702
+ const float d8_1 = bq8_1[0].ds.x;
2703
+ const float d8_2 = bq8_1[1].ds.x;
2704
+
2705
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2706
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2707
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2708
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2709
+
2710
+ const int * q4 = (const int *)bq4_K->qs + (iqs/2);
2711
+ const int v1 = q4[0];
2712
+ const int v2 = q4[4];
2713
+
2714
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
2715
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
2716
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
2717
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
2718
+
2719
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
2720
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
2721
+
2722
+ return dall * sumf_d - dmin * sumf_m;
2723
+
2724
+ #else
2725
+ return 0.0f; // only to satisfy the compiler
2726
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2727
+
2728
+ #endif
2729
+ }
2730
+
2731
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2732
+
2733
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2734
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2735
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2736
+
2737
+ *x_ql = tile_x_ql;
2738
+ *x_dm = tile_x_dm;
2739
+ *x_sc = tile_x_sc;
2740
+ }
2741
+
2742
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2743
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2744
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2745
+
2746
+ __builtin_assume(i_offset >= 0);
2747
+ __builtin_assume(i_offset < nwarps);
2748
+ __builtin_assume(k >= 0);
2749
+ __builtin_assume(k < WARP_SIZE);
2750
+
2751
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
2752
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
2753
+
2754
+ const block_q4_K * bx0 = (block_q4_K *) vx;
2755
+
2756
+ #pragma unroll
2757
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2758
+ int i = i0 + i_offset;
2759
+
2760
+ if (need_check) {
2761
+ i = min(i, i_max);
2762
+ }
2763
+
2764
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2765
+
2766
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2767
+ }
2768
+
2769
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2770
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2771
+
2772
+ #pragma unroll
2773
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2774
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2775
+
2776
+ if (need_check) {
2777
+ i = min(i, i_max);
2778
+ }
2779
+
2780
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2781
+
2782
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2783
+ }
2784
+
2785
+ #pragma unroll
2786
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2787
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2788
+
2789
+ if (need_check) {
2790
+ i = min(i, i_max);
2791
+ }
2792
+
2793
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2794
+
2795
+ const int * scales = (int *) bxi->scales;
2796
+
2797
+ const int ksc = k % (WARP_SIZE/8);
2798
+
2799
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
2800
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2801
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2802
+
2803
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2804
+ }
2805
+ }
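
The repacking loop above turns the 12-byte q4_K scale field into whole-int shared memory words holding sc0..sc7 followed by m0..m7. For reference, this is the scalar unpacking it starts from, written as a host-side sketch (the helper name and sample values are illustrative; it assumes the usual K-quant packing of eight 6-bit scales and eight 6-bit mins in 12 bytes).

    #include <cstdint>
    #include <cstdio>

    // recover scale j and min j (j = 0..7) from the raw 12-byte scales array
    static void get_scale_min_ref(int j, const uint8_t * q, uint8_t * sc, uint8_t * m) {
        if (j < 4) {                 // scales/mins 0..3: low 6 bits of bytes 0..3 / 4..7
            *sc = q[j]     & 63;
            *m  = q[j + 4] & 63;
        } else {                     // scales/mins 4..7: nibbles of bytes 8..11 plus the
            *sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4); // top 2 bits of bytes 0..3
            *m  = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4); // top 2 bits of bytes 4..7
        }
    }

    int main() {
        const uint8_t scales[12] = { // hypothetical packed scale block
            0x7F, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x98, 0x21, 0x32, 0x43};
        for (int j = 0; j < 8; ++j) {
            uint8_t sc, m;
            get_scale_min_ref(j, scales, &sc, &m);
            printf("j = %d: scale %2d, min %2d\n", j, (int) sc, (int) m);
        }
        return 0;
    }

The same split (6 low bits taken directly, 4 + 2 bits for the upper half) is what the aux[]/sc/m computation in vec_dot_q4_K_q8_1 performs two scales at a time on 16-bit words.
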
2806
+
2807
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2808
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2809
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2810
+
2811
+ int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
2812
+
2813
+ #pragma unroll
2814
+ for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
2815
+ v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
2816
+ v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
2817
+ }
2818
+
2819
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2820
+
2821
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2822
+ return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2823
+ }
2824
+
2825
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2826
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
2827
+
2828
+ #ifndef GGML_QKK_64
2829
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2830
+
2831
+ int vl[2];
2832
+ int vh[2];
2833
+ int u[2*QR5_K];
2834
+ float d8[QR5_K];
2835
+
2836
+ const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
2837
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
2838
+ const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
2839
+
2840
+ vl[0] = ql[0];
2841
+ vl[1] = ql[4];
2842
+
2843
+ vh[0] = qh[0] >> bq8_offset;
2844
+ vh[1] = qh[4] >> bq8_offset;
2845
+
2846
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
2847
+ uint16_t aux[2];
2848
+ const int j = bq8_offset/2;
2849
+ if (j < 2) {
2850
+ aux[0] = scales[j+0] & 0x3f3f;
2851
+ aux[1] = scales[j+2] & 0x3f3f;
2852
+ } else {
2853
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
2854
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
2855
+ }
2856
+ const uint8_t * sc = (const uint8_t *)aux;
2857
+ const uint8_t * m = sc + 2;
2858
+
2859
+ #pragma unroll
2860
+ for (int i = 0; i < QR5_K; ++i) {
2861
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2862
+ d8[i] = bq8i->ds.x;
2863
+
2864
+ const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
2865
+ u[2*i+0] = q8[0];
2866
+ u[2*i+1] = q8[4];
2867
+ }
2868
+
2869
+ return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2870
+
2871
+ #else
2872
+
2873
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2874
+ const block_q5_K * bq5_K = (const block_q5_K *) vbq;
2875
+
2876
+ const int8_t * s = bq5_K->scales;
2877
+
2878
+ const float d = bq5_K->d;
2879
+
2880
+ const float d8_1 = bq8_1[0].ds.x;
2881
+ const float d8_2 = bq8_1[1].ds.x;
2882
+
2883
+ const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
2884
+ const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
2885
+ const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
2886
+ const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
2887
+
2888
+ const int * ql = (const int *)bq5_K->qs + (iqs/2);
2889
+ const int vl1 = ql[0];
2890
+ const int vl2 = ql[4];
2891
+
2892
+ const int step = 4 * (iqs/2); // 0, 4, 8, 12
2893
+ const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
2894
+ const int in = step%8; // 0, 4, 0, 4
2895
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
2896
+
2897
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
2898
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
2899
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
2900
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
2901
+
2902
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
2903
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
2904
+
2905
+ return d * sumf_d;
2906
+
2907
+ #else
2908
+ return 0.0f; // only to satisfy the compiler
2909
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2910
+
2911
+ #endif
2912
+ }
2913
+
2914
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2915
+
2916
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2917
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2918
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2919
+
2920
+ *x_ql = tile_x_ql;
2921
+ *x_dm = tile_x_dm;
2922
+ *x_sc = tile_x_sc;
2923
+ }
2924
+
2925
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2926
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2927
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2928
+
2929
+ __builtin_assume(i_offset >= 0);
2930
+ __builtin_assume(i_offset < nwarps);
2931
+ __builtin_assume(k >= 0);
2932
+ __builtin_assume(k < WARP_SIZE);
2933
+
2934
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
2935
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
2936
+
2937
+ const block_q5_K * bx0 = (block_q5_K *) vx;
2938
+
2939
+ #pragma unroll
2940
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2941
+ int i = i0 + i_offset;
2942
+
2943
+ if (need_check) {
2944
+ i = min(i, i_max);
2945
+ }
2946
+
2947
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2948
+ const int ky = QR5_K*kqsx;
2949
+
2950
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2951
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2952
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2953
+
2954
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2955
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2956
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2957
+
2958
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
2959
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
2960
+
2961
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
2962
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2963
+ }
2964
+
2965
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2966
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2967
+
2968
+ #pragma unroll
2969
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
2970
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2971
+
2972
+ if (need_check) {
2973
+ i = min(i, i_max);
2974
+ }
2975
+
2976
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
2977
+
2978
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
2979
+ }
2980
+
2981
+ #pragma unroll
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2983
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2984
+
2985
+ if (need_check) {
2986
+ i = min(i, i_max);
2987
+ }
2988
+
2989
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2990
+
2991
+ const int * scales = (int *) bxi->scales;
2992
+
2993
+ const int ksc = k % (WARP_SIZE/8);
2994
+
2995
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
2996
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2997
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2998
+
2999
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
3000
+ }
3001
+ }
3002
+
3003
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3004
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3005
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3006
+
3007
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3008
+
3009
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3010
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3011
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
3012
+ }
3013
+
3014
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
3015
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
3016
+
3017
+ const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3018
+
3019
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
3020
+ const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
3021
+ const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3022
+
3023
+ const int vl = get_int_from_uint8(bq6_K->ql, iqs);
3024
+ const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
3025
+
3026
+ const int8_t * scales = bq6_K->scales + scale_offset;
3027
+
3028
+ int u[QR6_K];
3029
+ float d8[QR6_K];
3030
+
3031
+ #pragma unroll
3032
+ for (int i = 0; i < QR6_K; ++i) {
3033
+ u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
3034
+ d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
3035
+ }
3036
+
3037
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
3038
+ }
3039
+
3040
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3041
+
3042
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3043
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3044
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
3045
+
3046
+ *x_ql = tile_x_ql;
3047
+ *x_dm = tile_x_dm;
3048
+ *x_sc = tile_x_sc;
3049
+ }
3050
+
3051
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3052
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3053
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3054
+
3055
+ __builtin_assume(i_offset >= 0);
3056
+ __builtin_assume(i_offset < nwarps);
3057
+ __builtin_assume(k >= 0);
3058
+ __builtin_assume(k < WARP_SIZE);
3059
+
3060
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
3061
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
3062
+
3063
+ const block_q6_K * bx0 = (block_q6_K *) vx;
3064
+
3065
+ #pragma unroll
3066
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
3067
+ int i = i0 + i_offset;
3068
+
3069
+ if (need_check) {
3070
+ i = min(i, i_max);
3071
+ }
3072
+
3073
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3074
+ const int ky = QR6_K*kqsx;
3075
+
3076
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3077
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3078
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
3079
+
3080
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3081
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3082
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3083
+
3084
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3085
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3086
+
3087
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3088
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3089
+ }
3090
+
3091
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3092
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3093
+ float * x_dmf = (float *) x_dm;
3094
+
3095
+ #pragma unroll
3096
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3097
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3098
+
3099
+ if (need_check) {
3100
+ i = min(i, i_max);
3101
+ }
3102
+
3103
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3104
+
3105
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3106
+ }
3107
+
3108
+ #pragma unroll
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3110
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3111
+
3112
+ if (need_check) {
3113
+ i = min(i, i_max);
3114
+ }
3115
+
3116
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
3117
+
3118
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
3119
+ }
3120
+ }
3121
+
3122
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3123
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3124
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3125
+
3126
+ const float * x_dmf = (const float *) x_dm;
3127
+ const float * y_df = (const float *) y_ds;
3128
+
3129
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3130
+
3131
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3132
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3133
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3134
+ }
3135
+
3136
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3137
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3138
+ static __global__ void mul_mat_q(
3139
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3140
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3141
+
3142
+ const block_q_t * x = (const block_q_t *) vx;
3143
+ const block_q8_1 * y = (const block_q8_1 *) vy;
3144
+
3145
+ const int blocks_per_row_x = ncols_x / qk;
3146
+ const int blocks_per_col_y = nrows_y / QK8_1;
3147
+ const int blocks_per_warp = WARP_SIZE / qi;
3148
+
3149
+ const int & ncols_dst = ncols_y;
3150
+
3151
+ const int row_dst_0 = blockIdx.x*mmq_y;
3152
+ const int & row_x_0 = row_dst_0;
3153
+ const int row_dst = row_dst_0 + threadIdx.x;
3154
+
3155
+ const int col_dst_0 = blockIdx.y*mmq_x;
3156
+ const int & col_y_0 = col_dst_0;
3157
+
3158
+ int * tile_x_ql = nullptr;
3159
+ half2 * tile_x_dm = nullptr;
3160
+ int * tile_x_qh = nullptr;
3161
+ int * tile_x_sc = nullptr;
3162
+
3163
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3164
+
3165
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3166
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
1767
3167
 
1768
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1769
- const block_q6_K * bq6_K = (const block_q6_K *) vbq;
3168
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
1770
3169
 
1771
- const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1772
- const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1773
- const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
3170
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
1774
3171
 
1775
- float sumf = 0.0f;
3172
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3173
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3174
+
3175
+ #pragma unroll
3176
+ for (int ir = 0; ir < qr; ++ir) {
3177
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3178
+ const int kbxd = kqs / QI8_1;
1776
3179
 
1777
- const float d = bq6_K->d;
3180
+ #pragma unroll
3181
+ for (int i = 0; i < mmq_x; i += nwarps) {
3182
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
1778
3183
 
1779
- int vl;
1780
- memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
3184
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
1781
3185
 
1782
- int vh;
1783
- memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
3186
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3187
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3188
+ }
1784
3189
 
1785
- for (int i = 0; i < QR6_K; ++i) {
1786
- const int sc = bq6_K->scales[scale_offset + 4*i];
3190
+ #pragma unroll
3191
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3192
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3193
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3194
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3195
+
3196
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3197
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3198
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3199
+ if (need_sum) {
3200
+ *dsi_dst = *dsi_src;
3201
+ } else {
3202
+ float * dfi_dst = (float *) dsi_dst;
3203
+ *dfi_dst = (*dsi_src).x;
3204
+ }
3205
+ }
1787
3206
 
1788
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
1789
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
1790
- const float d8i = bq8i->d;
3207
+ __syncthreads();
1791
3208
 
1792
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3209
+ // #pragma unroll // unrolling this loop causes too much register pressure
3210
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3211
+ #pragma unroll
3212
+ for (int j = 0; j < mmq_x; j += nwarps) {
3213
+ #pragma unroll
3214
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3215
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3216
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3217
+ threadIdx.x + i, threadIdx.y + j, k);
3218
+ }
3219
+ }
3220
+ }
1793
3221
 
1794
- const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
3222
+ __syncthreads();
3223
+ }
3224
+ }
1795
3225
 
1796
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1797
3226
 
1798
- sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
3227
+ if (row_dst >= nrows_dst) {
3228
+ return;
1799
3229
  }
1800
3230
 
1801
- return d*sumf;
1802
- #else
1803
- return 0.0f; // only to satisfy the compiler
1804
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3231
+ for (int j = 0; j < mmq_x; j += nwarps) {
3232
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3233
+
3234
+ if (col_dst >= ncols_dst) {
3235
+ return;
3236
+ }
3237
+
3238
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3240
+ }
3241
+ }
1805
3242
  }
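
Stripped of quantization and shared memory, the control flow of mul_mat_q is a standard blocked matrix multiplication: each CUDA block owns an mmq_y x mmq_x tile of dst, walks the reduction dimension one tile at a time, and keeps its partial sums in registers. The host-side sketch below (toy sizes, plain float, purely illustrative) mirrors that loop nest, with dst written column-major exactly as in the kernel.

    #include <cstdio>
    #include <vector>

    int main() {
        const int nrows_x = 8, ncols_x = 16, ncols_y = 4;  // toy problem sizes
        const int mmq_y = 4, mmq_x = 2, ktile = 8;         // tile sizes (illustrative)

        std::vector<float> x(nrows_x*ncols_x, 1.0f);       // "weights", row-major
        std::vector<float> y(ncols_x*ncols_y, 2.0f);       // "activations", one column after another
        std::vector<float> dst(nrows_x*ncols_y, 0.0f);     // result, column-major like the kernel

        for (int row0 = 0; row0 < nrows_x; row0 += mmq_y)      // one CUDA block per
        for (int col0 = 0; col0 < ncols_y; col0 += mmq_x)      // (row0, col0) output tile
        for (int k0 = 0; k0 < ncols_x; k0 += ktile) {          // k tile, like ib0 above
            // on the GPU the x and y tiles are staged in shared memory here
            for (int i = 0; i < mmq_y; ++i)
            for (int j = 0; j < mmq_x; ++j)
            for (int k = k0; k < k0 + ktile; ++k) {
                dst[(col0 + j)*nrows_x + (row0 + i)] +=
                    x[(row0 + i)*ncols_x + k] * y[(col0 + j)*ncols_x + k];
            }
        }
        printf("dst[0] = %.1f (expected %.1f)\n", dst[0], 2.0f*ncols_x);
        return 0;
    }
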
1806
3243
 
1807
- template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
3244
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
1808
3245
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1809
3246
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1810
3247
 
@@ -1813,7 +3250,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1813
3250
  }
1814
3251
 
1815
3252
  const int blocks_per_row = ncols / qk;
1816
- const int blocks_per_warp = WARP_SIZE / qi;
3253
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
1817
3254
 
1818
3255
  // partial sum for each thread
1819
3256
  float tmp = 0.0f;
@@ -1822,11 +3259,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
1822
3259
  const block_q8_1 * y = (const block_q8_1 *) vy;
1823
3260
 
1824
3261
  for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1825
- const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
3262
+ const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
1826
3263
 
1827
- const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
3264
+ const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
1828
3265
 
1829
- const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
3266
+ const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
1830
3267
 
1831
3268
  tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
1832
3269
  }
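
With the new vdr template parameter each lane feeds vdr consecutive ints into a single vec_dot call, so qi/vdr lanes cooperate on one quant block and a warp covers vdr*WARP_SIZE/qi blocks per iteration. A small worked example (host-only; assumes q4_0, i.e. qi = QI4_0 = 4, and assumes VDR_Q4_0_Q8_1_MMVQ = 2) shows the resulting lane-to-block mapping.

    #include <cstdio>

    int main() {
        const int qk = 32, qi = 4, vdr = 2, WARP_SIZE = 32, QK8_1 = 32;
        printf("blocks per warp: %d\n", vdr * WARP_SIZE / qi);   // 16 instead of the previous 8
        for (int tid = 0; tid < 4; ++tid) {                      // first four lanes
            const int ibx = tid / (qi/vdr);                      // x block offset within the row
            const int iby = (tid / (qi/vdr)) * (qk/QK8_1);       // matching y block offset
            const int iqs = vdr * (tid % (qi/vdr));              // first int handled by this lane
            printf("lane %d -> x block +%d, y block +%d, iqs %d\n", tid, ibx, iby, iqs);
        }
        return 0;
    }
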
@@ -1859,11 +3296,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1859
3296
  const int y_offset = qr == 1 ? 1 : qk/2;
1860
3297
 
1861
3298
  // partial sum for each thread
1862
- #ifdef GGML_CUDA_DMMV_F16
3299
+ #ifdef GGML_CUDA_F16
1863
3300
  half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
1864
3301
  #else
1865
3302
  float tmp = 0.0f;
1866
- #endif // GGML_CUDA_DMMV_F16
3303
+ #endif // GGML_CUDA_F16
1867
3304
 
1868
3305
  for (int i = 0; i < ncols; i += iter_stride) {
1869
3306
  const int col = i + vals_per_iter*tid;
@@ -1883,7 +3320,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1883
3320
 
1884
3321
  // matrix multiplication
1885
3322
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
1886
- #ifdef GGML_CUDA_DMMV_F16
3323
+ #ifdef GGML_CUDA_F16
1887
3324
  tmp += __hmul2(v, {
1888
3325
  y[iybs + iqs + j/qr + 0],
1889
3326
  y[iybs + iqs + j/qr + y_offset]
@@ -1891,7 +3328,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1891
3328
  #else
1892
3329
  tmp += v.x * y[iybs + iqs + j/qr + 0];
1893
3330
  tmp += v.y * y[iybs + iqs + j/qr + y_offset];
1894
- #endif // GGML_CUDA_DMMV_F16
3331
+ #endif // GGML_CUDA_F16
1895
3332
  }
1896
3333
  }
1897
3334
 
@@ -1902,11 +3339,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
1902
3339
  }
1903
3340
 
1904
3341
  if (tid == 0) {
1905
- #ifdef GGML_CUDA_DMMV_F16
3342
+ #ifdef GGML_CUDA_F16
1906
3343
  dst[row] = tmp.x + tmp.y;
1907
3344
  #else
1908
3345
  dst[row] = tmp;
1909
- #endif // GGML_CUDA_DMMV_F16
3346
+ #endif // GGML_CUDA_F16
1910
3347
  }
1911
3348
  }
1912
3349
 
@@ -2046,7 +3483,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
2046
3483
  }
2047
3484
 
2048
3485
  // rope == RoPE == rotary positional embedding
2049
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
3486
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3487
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
2050
3488
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
2051
3489
 
2052
3490
  if (col >= ncols) {
@@ -2056,7 +3494,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
2056
3494
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
2057
3495
  const int i = row*ncols + col;
2058
3496
 
2059
- const float theta = p*powf(theta_scale, col/2);
3497
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
2060
3498
  const float sin_theta = sinf(theta);
2061
3499
  const float cos_theta = cosf(theta);
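
The added p0/p_delta parameters let a single launch rotate several positions at once: every p_delta_rows rows the effective position advances by p_delta, instead of every row sharing one p. A short host sketch of the new theta formula (all values assumed, for illustration only):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float p0 = 5.0f, p_delta = 1.0f, theta_scale = 0.5f;
        const int   p_delta_rows = 4;                  // rows per position (assumed)
        const int   col = 2;                           // one rotated pair
        for (int row = 0; row < 8; ++row) {
            const float theta = (p0 + p_delta * (row / p_delta_rows)) * powf(theta_scale, col/2);
            printf("row %d: theta = %.3f\n", row, theta);
        }
        return 0;
    }
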
2062
3500
 
@@ -2203,9 +3641,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
2203
3641
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
2204
3642
  }
2205
3643
 
2206
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
2207
- const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
2208
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
3644
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
3645
+ const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
3646
+ const dim3 num_blocks(block_num_x, ky, 1);
3647
+ const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
3648
+ quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
2209
3649
  }
2210
3650
 
2211
3651
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2366,7 +3806,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
2366
3806
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2367
3807
  const dim3 block_nums(1, block_num_y, 1);
2368
3808
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2369
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
3809
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
2370
3810
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2371
3811
  }
2372
3812
 
@@ -2375,7 +3815,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
2375
3815
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2376
3816
  const dim3 block_nums(1, block_num_y, 1);
2377
3817
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2378
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
3818
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
2379
3819
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2380
3820
  }
2381
3821
 
@@ -2384,7 +3824,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
2384
3824
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2385
3825
  const dim3 block_nums(1, block_num_y, 1);
2386
3826
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2387
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
3827
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
2388
3828
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2389
3829
  }
2390
3830
 
@@ -2393,7 +3833,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
2393
3833
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2394
3834
  const dim3 block_nums(1, block_num_y, 1);
2395
3835
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2396
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
3836
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
2397
3837
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2398
3838
  }
2399
3839
 
@@ -2402,7 +3842,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
2402
3842
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2403
3843
  const dim3 block_nums(1, block_num_y, 1);
2404
3844
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2405
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
3845
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
2406
3846
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2407
3847
  }
2408
3848
 
@@ -2411,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
2411
3851
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2412
3852
  const dim3 block_nums(1, block_num_y, 1);
2413
3853
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2414
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
3854
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
2415
3855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2416
3856
  }
2417
3857
 
@@ -2420,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
2420
3860
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2421
3861
  const dim3 block_nums(1, block_num_y, 1);
2422
3862
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2423
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
3863
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
2424
3864
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2425
3865
  }
2426
3866
 
@@ -2429,10 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2429
3869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2430
3870
  const dim3 block_nums(1, block_num_y, 1);
2431
3871
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2432
- // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
2433
- // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
2434
- // is better amortized.
2435
- mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
3872
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
2436
3873
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2437
3874
  }
2438
3875
 
@@ -2441,10 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
2441
3878
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2442
3879
  const dim3 block_nums(1, block_num_y, 1);
2443
3880
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2444
- // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
2445
- // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
2446
- // is better amortized.
2447
- mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
3881
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
2448
3882
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2449
3883
  }
2450
3884
 
@@ -2453,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
2453
3887
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2454
3888
  const dim3 block_nums(1, block_num_y, 1);
2455
3889
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2456
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
3890
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
2457
3891
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2458
3892
  }
2459
3893
 
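
Each mul_mat_vec_q launch from q4_0 through q6_K gains one more compile-time template argument, the VDR_*_MMVQ constants, which appear to control how many groups of quants each dot-product call consumes; this replaces the earlier QI4_K/2 and QI5_K/2 workaround whose explanatory comments are deleted above. The block below is not the library's template, only a minimal C++ sketch of the general idea: a compile-time width parameter threaded through to the dot-product helper so its inner loop has a constant trip count.

    #include <cstdio>

    // Minimal stand-in for a vec_dot_*_q8_1 helper: consumes `vdr` consecutive
    // values per call; with vdr known at compile time the loop can be unrolled.
    template <int vdr>
    float vec_dot(const float * x, const float * y) {
        float sum = 0.0f;
        for (int i = 0; i < vdr; ++i) {
            sum += x[i] * y[i];
        }
        return sum;
    }

    // The caller selects the instantiation, analogous to
    // mul_mat_vec_q<QK, QI, block_t, VDR_..._MMVQ, vec_dot_..._q8_1>.
    template <int vdr, float (*dot)(const float *, const float *)>
    float reduce(const float * x, const float * y, int n) {
        float acc = 0.0f;
        for (int i = 0; i + vdr <= n; i += vdr) {
            acc += dot(x + i, y + i);
        }
        return acc;
    }

    int main() {
        const float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        const float y[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        std::printf("dot = %.1f\n", reduce<4, vec_dot<4>>(x, y, 8));
        return 0;
    }
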
@@ -2500,6 +3934,537 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
2500
3934
  }
2501
3935
  }
2502
3936
 
3937
+ static void ggml_mul_mat_q4_0_q8_1_cuda(
3938
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3939
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3940
+
3941
+ int id;
3942
+ CUDA_CHECK(cudaGetDevice(&id));
3943
+ const int compute_capability = g_compute_capabilities[id];
3944
+
3945
+ if (compute_capability >= CC_TURING) {
3946
+ const int mmq_x = 64;
3947
+ const int mmq_y = 128;
3948
+ const int nwarps = 4;
3949
+
3950
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3953
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
+
3955
+ if (nrows_x % mmq_y == 0) {
3956
+ const bool need_check = false;
3957
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
+ } else {
3961
+ const bool need_check = true;
3962
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
+ }
3966
+ } else {
3967
+ const int mmq_x = 64;
3968
+ const int mmq_y = 64;
3969
+ const int nwarps = 4;
3970
+
3971
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3974
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
+
3976
+ if (nrows_x % mmq_y == 0) {
3977
+ const bool need_check = false;
3978
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
+ } else {
3982
+ const bool need_check = true;
3983
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ }
3987
+ }
3988
+ }
3989
+
3990
+ static void ggml_mul_mat_q4_1_q8_1_cuda(
3991
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3992
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3993
+
3994
+ int id;
3995
+ CUDA_CHECK(cudaGetDevice(&id));
3996
+ const int compute_capability = g_compute_capabilities[id];
3997
+
3998
+ if (compute_capability >= CC_TURING) {
3999
+ const int mmq_x = 64;
4000
+ const int mmq_y = 128;
4001
+ const int nwarps = 4;
4002
+
4003
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4006
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
+
4008
+ if (nrows_x % mmq_y == 0) {
4009
+ const bool need_check = false;
4010
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
+ } else {
4014
+ const bool need_check = true;
4015
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
+ }
4019
+ } else {
4020
+ const int mmq_x = 64;
4021
+ const int mmq_y = 64;
4022
+ const int nwarps = 8;
4023
+
4024
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4027
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
+
4029
+ if (nrows_x % mmq_y == 0) {
4030
+ const bool need_check = false;
4031
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
+ } else {
4035
+ const bool need_check = true;
4036
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
+ }
4040
+
4041
+ }
4042
+ }
4043
+
4044
+ static void ggml_mul_mat_q5_0_q8_1_cuda(
4045
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4046
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4047
+
4048
+ int id;
4049
+ CUDA_CHECK(cudaGetDevice(&id));
4050
+ const int compute_capability = g_compute_capabilities[id];
4051
+
4052
+ if (compute_capability >= CC_TURING) {
4053
+ const int mmq_x = 128;
4054
+ const int mmq_y = 64;
4055
+ const int nwarps = 4;
4056
+
4057
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4060
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
+
4062
+ if (nrows_x % mmq_y == 0) {
4063
+ const bool need_check = false;
4064
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
+ } else {
4068
+ const bool need_check = true;
4069
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
+ }
4073
+ } else {
4074
+ const int mmq_x = 64;
4075
+ const int mmq_y = 64;
4076
+ const int nwarps = 8;
4077
+
4078
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4081
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
+
4083
+ if (nrows_x % mmq_y == 0) {
4084
+ const bool need_check = false;
4085
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
+ } else {
4089
+ const bool need_check = true;
4090
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
+ }
4094
+ }
4095
+ }
4096
+
4097
+ static void ggml_mul_mat_q5_1_q8_1_cuda(
4098
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4099
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4100
+
4101
+ int id;
4102
+ CUDA_CHECK(cudaGetDevice(&id));
4103
+ const int compute_capability = g_compute_capabilities[id];
4104
+
4105
+ if (compute_capability >= CC_TURING) {
4106
+ const int mmq_x = 128;
4107
+ const int mmq_y = 64;
4108
+ const int nwarps = 8;
4109
+
4110
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4113
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
+
4115
+ if (nrows_x % mmq_y == 0) {
4116
+ const bool need_check = false;
4117
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
+ } else {
4121
+ const bool need_check = true;
4122
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
+ }
4126
+ } else {
4127
+ const int mmq_x = 64;
4128
+ const int mmq_y = 64;
4129
+ const int nwarps = 8;
4130
+
4131
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4134
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
+
4136
+ if (nrows_x % mmq_y == 0) {
4137
+ const bool need_check = false;
4138
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
+ } else {
4142
+ const bool need_check = true;
4143
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
+ }
4147
+ }
4148
+ }
4149
+
4150
+ static void ggml_mul_mat_q8_0_q8_1_cuda(
4151
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4152
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4153
+
4154
+ int id;
4155
+ CUDA_CHECK(cudaGetDevice(&id));
4156
+ const int compute_capability = g_compute_capabilities[id];
4157
+
4158
+ if (compute_capability >= CC_TURING) {
4159
+ const int mmq_x = 128;
4160
+ const int mmq_y = 64;
4161
+ const int nwarps = 4;
4162
+
4163
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4166
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
+
4168
+ if (nrows_x % mmq_y == 0) {
4169
+ const bool need_check = false;
4170
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
+ } else {
4174
+ const bool need_check = true;
4175
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
+ }
4179
+ } else {
4180
+ const int mmq_x = 64;
4181
+ const int mmq_y = 64;
4182
+ const int nwarps = 8;
4183
+
4184
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4187
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
+
4189
+ if (nrows_x % mmq_y == 0) {
4190
+ const bool need_check = false;
4191
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
+ } else {
4195
+ const bool need_check = true;
4196
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
+ }
4200
+ }
4201
+ }
4202
+
4203
+ static void ggml_mul_mat_q2_K_q8_1_cuda(
4204
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4205
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4206
+
4207
+ int id;
4208
+ CUDA_CHECK(cudaGetDevice(&id));
4209
+ const int compute_capability = g_compute_capabilities[id];
4210
+
4211
+ if (compute_capability >= CC_TURING) {
4212
+ const int mmq_x = 64;
4213
+ const int mmq_y = 128;
4214
+ const int nwarps = 4;
4215
+
4216
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4217
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4218
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4219
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4220
+
4221
+ if (nrows_x % mmq_y == 0) {
4222
+ const bool need_check = false;
4223
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4224
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4225
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4226
+ } else {
4227
+ const bool need_check = true;
4228
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4229
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4230
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4231
+ }
4232
+ } else {
4233
+ const int mmq_x = 64;
4234
+ const int mmq_y = 64;
4235
+ const int nwarps = 8;
4236
+
4237
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4238
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4239
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4240
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4241
+
4242
+ if (nrows_x % mmq_y == 0) {
4243
+ const bool need_check = false;
4244
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4245
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4246
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4247
+ } else {
4248
+ const bool need_check = true;
4249
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4250
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4251
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4252
+ }
4253
+ }
4254
+ }
4255
+
4256
+ static void ggml_mul_mat_q3_K_q8_1_cuda(
4257
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4258
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4259
+
4260
+ int id;
4261
+ CUDA_CHECK(cudaGetDevice(&id));
4262
+ const int compute_capability = g_compute_capabilities[id];
4263
+
4264
+ if (compute_capability >= CC_TURING) {
4265
+ const int mmq_x = 128;
4266
+ const int mmq_y = 128;
4267
+ const int nwarps = 4;
4268
+
4269
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4270
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4271
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4272
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4273
+
4274
+ if (nrows_x % mmq_y == 0) {
4275
+ const bool need_check = false;
4276
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4277
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4278
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4279
+ } else {
4280
+ const bool need_check = true;
4281
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4282
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4283
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
+ }
4285
+ } else {
4286
+ const int mmq_x = 64;
4287
+ const int mmq_y = 64;
4288
+ const int nwarps = 8;
4289
+
4290
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4291
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4292
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4293
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4294
+
4295
+ if (nrows_x % mmq_y == 0) {
4296
+ const bool need_check = false;
4297
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4298
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4299
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4300
+ } else {
4301
+ const bool need_check = true;
4302
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4303
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4304
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4305
+ }
4306
+ }
4307
+ }
4308
+
4309
+ static void ggml_mul_mat_q4_K_q8_1_cuda(
4310
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4311
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4312
+
4313
+ int id;
4314
+ CUDA_CHECK(cudaGetDevice(&id));
4315
+ const int compute_capability = g_compute_capabilities[id];
4316
+
4317
+ if (compute_capability >= CC_TURING) {
4318
+ const int mmq_x = 64;
4319
+ const int mmq_y = 128;
4320
+ const int nwarps = 4;
4321
+
4322
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4323
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4324
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4325
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4326
+
4327
+ if (nrows_x % mmq_y == 0) {
4328
+ const bool need_check = false;
4329
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4330
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4331
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4332
+ } else {
4333
+ const bool need_check = true;
4334
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4335
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4336
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4337
+ }
4338
+ } else {
4339
+ const int mmq_x = 32;
4340
+ const int mmq_y = 64;
4341
+ const int nwarps = 8;
4342
+
4343
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4344
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4345
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4346
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4347
+
4348
+ if (nrows_x % mmq_y == 0) {
4349
+ const bool need_check = false;
4350
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4351
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4352
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4353
+ } else {
4354
+ const bool need_check = true;
4355
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4356
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4357
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4358
+ }
4359
+ }
4360
+ }
4361
+
4362
+ static void ggml_mul_mat_q5_K_q8_1_cuda(
4363
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4364
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4365
+
4366
+ int id;
4367
+ CUDA_CHECK(cudaGetDevice(&id));
4368
+ const int compute_capability = g_compute_capabilities[id];
4369
+
4370
+ if (compute_capability >= CC_TURING) {
4371
+ const int mmq_x = 64;
4372
+ const int mmq_y = 128;
4373
+ const int nwarps = 4;
4374
+
4375
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4376
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4377
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4378
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4379
+
4380
+ if (nrows_x % mmq_y == 0) {
4381
+ const bool need_check = false;
4382
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4383
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4384
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4385
+ } else {
4386
+ const bool need_check = true;
4387
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4388
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4389
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4390
+ }
4391
+ } else {
4392
+ const int mmq_x = 64;
4393
+ const int mmq_y = 64;
4394
+ const int nwarps = 8;
4395
+
4396
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4398
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4399
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4400
+
4401
+ if (nrows_x % mmq_y == 0) {
4402
+ const bool need_check = false;
4403
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4404
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4405
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4406
+ } else {
4407
+ const bool need_check = true;
4408
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4409
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4410
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4411
+ }
4412
+ }
4413
+ }
4414
+
4415
+ static void ggml_mul_mat_q6_K_q8_1_cuda(
4416
+ const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4417
+ const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4418
+
4419
+ int id;
4420
+ CUDA_CHECK(cudaGetDevice(&id));
4421
+ const int compute_capability = g_compute_capabilities[id];
4422
+
4423
+ if (compute_capability >= CC_TURING) {
4424
+ const int mmq_x = 64;
4425
+ const int mmq_y = 64;
4426
+ const int nwarps = 4;
4427
+
4428
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4429
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4430
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4431
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4432
+
4433
+ if (nrows_x % mmq_y == 0) {
4434
+ const bool need_check = false;
4435
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4436
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4437
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4438
+ } else {
4439
+ const bool need_check = true;
4440
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4441
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4442
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4443
+ }
4444
+ } else {
4445
+ const int mmq_x = 32;
4446
+ const int mmq_y = 64;
4447
+ const int nwarps = 8;
4448
+
4449
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4450
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4451
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4452
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4453
+
4454
+ if (nrows_x % mmq_y == 0) {
4455
+ const bool need_check = false;
4456
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4457
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4458
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4459
+ } else {
4460
+ const bool need_check = true;
4461
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4462
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4463
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4464
+ }
4465
+ }
4466
+ }
4467
+
2503
4468
  static void ggml_mul_mat_p021_f16_f32_cuda(
2504
4469
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
2505
4470
  const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
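
All of the ggml_mul_mat_q*_q8_1_cuda launchers added above share one pattern: read the device's compute capability, choose a tile shape (mmq_x by mmq_y) and warp count depending on whether the GPU is Turing or newer (CC_TURING, i.e. 100*major + 10*minor >= 700), and instantiate mul_mat_q with or without bounds checks depending on whether nrows_x is a multiple of the tile height. The following is a compact host-side restatement of that selection logic using the q4_0 constants from above; the TileConfig struct and the pick_tile_q4_0 name are illustrative only.

    #include <cstdio>

    constexpr int CC_TURING = 700;  // same threshold the diff introduces

    struct TileConfig {
        int  mmq_x;       // tile width  (columns of src1 per block)
        int  mmq_y;       // tile height (rows of src0 per block)
        int  nwarps;      // warps per block
        bool need_check;  // out-of-bounds checks when nrows_x % mmq_y != 0
    };

    // Mirrors the branch structure of ggml_mul_mat_q4_0_q8_1_cuda; the other
    // quantization types follow the same shape with different constants.
    TileConfig pick_tile_q4_0(int compute_capability, int nrows_x) {
        TileConfig c;
        if (compute_capability >= CC_TURING) {
            c.mmq_x = 64; c.mmq_y = 128; c.nwarps = 4;
        } else {
            c.mmq_x = 64; c.mmq_y = 64;  c.nwarps = 4;
        }
        c.need_check = (nrows_x % c.mmq_y != 0);
        return c;
    }

    int main() {
        const int nrows_x = 4096, ncols_y = 512;
        const TileConfig c = pick_tile_q4_0(750, nrows_x);     // CC 7.5 encoded as 100*7 + 10*5
        const int grid_x = (nrows_x + c.mmq_y - 1) / c.mmq_y;  // blocks along src0 rows
        const int grid_y = (ncols_y + c.mmq_x - 1) / c.mmq_x;  // blocks along src1 columns
        std::printf("grid = (%d, %d), warps per block = %d, need_check = %d\n",
                    grid_x, grid_y, c.nwarps, (int) c.need_check);
        return 0;
    }
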
@@ -2544,12 +4509,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
2544
4509
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
2545
4510
  }
2546
4511
 
2547
- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
4512
+ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4513
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
2548
4514
  GGML_ASSERT(nrows % 2 == 0);
2549
4515
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
2550
4516
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
2551
4517
  const dim3 block_nums(num_blocks_x, nrows, 1);
2552
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
4518
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
2553
4519
  }
2554
4520
 
2555
4521
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2670,21 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
2670
4636
  }
2671
4637
 
2672
4638
 
2673
- static void * g_scratch_buffer = nullptr;
2674
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
2675
- static size_t g_scratch_offset = 0;
2676
-
2677
- static int g_device_count = -1;
2678
- static int g_main_device = 0;
2679
- #ifndef GGML_CUDA_FORCE_DMMV
2680
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
2681
- #endif
2682
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
2683
-
2684
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
2685
-
2686
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
2687
-
2688
4639
  void ggml_init_cublas() {
2689
4640
  static bool initialized = false;
2690
4641
 
@@ -2701,9 +4652,7 @@ void ggml_init_cublas() {
2701
4652
  g_tensor_split[id] = total_vram;
2702
4653
  total_vram += prop.totalGlobalMem;
2703
4654
 
2704
- #ifndef GGML_CUDA_FORCE_DMMV
2705
4655
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
2706
- #endif
2707
4656
  }
2708
4657
  for (int id = 0; id < g_device_count; ++id) {
2709
4658
  g_tensor_split[id] /= total_vram;
@@ -2965,6 +4914,114 @@ inline void ggml_cuda_op_rms_norm(
2965
4914
  (void) i1;
2966
4915
  }
2967
4916
 
4917
+ inline void ggml_cuda_op_mul_mat_q(
4918
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4919
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
4920
+ cudaStream_t & cudaStream_main){
4921
+
4922
+ GGML_ASSERT(src0_ddq_i != nullptr);
4923
+ GGML_ASSERT(src1_ddf_i != nullptr);
4924
+ GGML_ASSERT(dst_ddf_i != nullptr);
4925
+
4926
+ const int64_t ne00 = src0->ne[0];
4927
+
4928
+ const int64_t ne10 = src1->ne[0];
4929
+ const int64_t ne11 = src1->ne[1];
4930
+ GGML_ASSERT(ne10 % QK8_1 == 0);
4931
+
4932
+ const int64_t ne0 = dst->ne[0];
4933
+
4934
+ const int64_t i01_diff = i01_high - i01_low;
4935
+
4936
+ int id;
4937
+ CUDA_CHECK(cudaGetDevice(&id));
4938
+
4939
+ // the main device has a larger memory buffer to hold the results from all GPUs
4940
+ // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
4941
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
4942
+
4943
+ const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
4944
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
4945
+ size_t as;
4946
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
4947
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
4948
+
4949
+ switch (src0->type) {
4950
+ case GGML_TYPE_Q4_0:
4951
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4952
+ break;
4953
+ case GGML_TYPE_Q4_1:
4954
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4955
+ break;
4956
+ case GGML_TYPE_Q5_0:
4957
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4958
+ break;
4959
+ case GGML_TYPE_Q5_1:
4960
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4961
+ break;
4962
+ case GGML_TYPE_Q8_0:
4963
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4964
+ break;
4965
+ case GGML_TYPE_Q2_K:
4966
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4967
+ break;
4968
+ case GGML_TYPE_Q3_K:
4969
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4970
+ break;
4971
+ case GGML_TYPE_Q4_K:
4972
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4973
+ break;
4974
+ case GGML_TYPE_Q5_K:
4975
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4976
+ break;
4977
+ case GGML_TYPE_Q6_K:
4978
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
4979
+ break;
4980
+ default:
4981
+ GGML_ASSERT(false);
4982
+ break;
4983
+ }
4984
+
4985
+ ggml_cuda_pool_free(src1_q8_1, as);
4986
+
4987
+ (void) src1;
4988
+ (void) dst;
4989
+ (void) src0_ddf_i;
4990
+ (void) i02;
4991
+ (void) i1;
4992
+ }
4993
+
4994
+ static int64_t get_row_rounding(ggml_type type) {
4995
+ int max_compute_capability = INT_MIN;
4996
+ for (int id = 0; id < g_device_count; ++id) {
4997
+ if (max_compute_capability < g_compute_capabilities[id]
4998
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
4999
+ max_compute_capability = g_compute_capabilities[id];
5000
+ }
5001
+ }
5002
+
5003
+ switch(type) {
5004
+ case GGML_TYPE_Q4_0:
5005
+ case GGML_TYPE_Q4_1:
5006
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5007
+ case GGML_TYPE_Q5_0:
5008
+ case GGML_TYPE_Q5_1:
5009
+ case GGML_TYPE_Q8_0:
5010
+ return 64;
5011
+ case GGML_TYPE_F16:
5012
+ return 1;
5013
+ case GGML_TYPE_Q2_K:
5014
+ case GGML_TYPE_Q3_K:
5015
+ case GGML_TYPE_Q4_K:
5016
+ case GGML_TYPE_Q5_K:
5017
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5018
+ case GGML_TYPE_Q6_K:
5019
+ return 64;
5020
+ default:
5021
+ GGML_ASSERT(false);
5022
+ }
5023
+ }
5024
+
2968
5025
  inline void ggml_cuda_op_mul_mat_vec(
2969
5026
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
2970
5027
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
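
ggml_cuda_op_mul_mat_q above quantizes the f32 src1 slice to q8_1 into a temporary pool buffer before dispatching on src0->type, and the buffer is sized from a row length padded to MATRIX_ROW_PADDING so every row holds a whole number of q8_1 blocks. Below is a sketch of that size computation; the 4-byte half2 header per block and the padding value of 512 are assumptions made for the example, not values stated in this hunk.

    #include <cstdint>
    #include <cstdio>

    constexpr int QK8_1              = 32;        // values per q8_1 block
    constexpr int SIZEOF_BLOCK_Q8_1  = 4 + QK8_1; // half2 ds + 32 int8 quants (assumed layout)
    constexpr int MATRIX_ROW_PADDING = 512;       // assumed value of the padding constant

    // Bytes needed for ne11 rows of ne10 floats once quantized to q8_1,
    // mirroring: padded_row_size*ne11*sizeof(block_q8_1)/QK8_1
    long long q8_1_buffer_size(int64_t ne10, int64_t ne11) {
        const int64_t padded = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
        return (long long) padded * ne11 * SIZEOF_BLOCK_Q8_1 / QK8_1;
    }

    int main() {
        std::printf("%lld bytes for a 4097 x 8 src1 slice\n", q8_1_buffer_size(4097, 8));
        return 0;
    }
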
@@ -2979,6 +5036,7 @@ inline void ggml_cuda_op_mul_mat_vec(
2979
5036
 
2980
5037
  #ifdef GGML_CUDA_FORCE_DMMV
2981
5038
  const bool use_mul_mat_vec_q = false;
5039
+ (void) g_compute_capabilities[0];
2982
5040
  #else
2983
5041
  int id;
2984
5042
  CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +5064,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3006
5064
  ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
3007
5065
  size_t as;
3008
5066
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
3009
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
5067
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
3010
5068
 
3011
5069
  switch (src0->type) {
3012
5070
  case GGML_TYPE_Q4_0:
@@ -3047,7 +5105,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3047
5105
  ggml_cuda_pool_free(src1_q8_1, as);
3048
5106
  } else {
3049
5107
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
3050
- #ifdef GGML_CUDA_DMMV_F16
5108
+ #ifdef GGML_CUDA_F16
3051
5109
  size_t ash;
3052
5110
  dfloat * src1_dfloat = nullptr; // dfloat == half
3053
5111
 
@@ -3063,7 +5121,7 @@ inline void ggml_cuda_op_mul_mat_vec(
3063
5121
  }
3064
5122
  #else
3065
5123
  dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
3066
- #endif // GGML_CUDA_DMMV_F16
5124
+ #endif // GGML_CUDA_F16
3067
5125
 
3068
5126
  switch (src0->type) {
3069
5127
  case GGML_TYPE_Q4_0:
@@ -3104,11 +5162,11 @@ inline void ggml_cuda_op_mul_mat_vec(
3104
5162
  break;
3105
5163
  }
3106
5164
 
3107
- #ifdef GGML_CUDA_DMMV_F16
5165
+ #ifdef GGML_CUDA_F16
3108
5166
  if (src1_convert_f16) {
3109
5167
  ggml_cuda_pool_free(src1_dfloat, ash);
3110
5168
  }
3111
- #endif // GGML_CUDA_DMMV_F16
5169
+ #endif // GGML_CUDA_F16
3112
5170
  }
3113
5171
 
3114
5172
  (void) src1;
@@ -3168,6 +5226,7 @@ inline void ggml_cuda_op_rope(
3168
5226
  GGML_ASSERT(dst_ddf_i != nullptr);
3169
5227
 
3170
5228
  const int64_t ne00 = src0->ne[0];
5229
+ const int64_t ne01 = src0->ne[1];
3171
5230
  const int64_t i01_diff = i01_high - i01_low;
3172
5231
 
3173
5232
  const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +5240,18 @@ inline void ggml_cuda_op_rope(
3181
5240
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
3182
5241
 
3183
5242
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
3184
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3185
5243
 
3186
- bool is_glm = mode & 4;
5244
+ const bool is_glm = mode & 4;
3187
5245
 
3188
5246
  // compute
3189
5247
  if (is_glm) {
5248
+ const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
3190
5249
  const float id_p = min(p, n_ctx - 2.f);
3191
5250
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
3192
5251
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
3193
5252
  } else {
3194
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
5253
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5254
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
3195
5255
  }
3196
5256
 
3197
5257
  (void) src1;
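
The rope path no longer computes a single position p on the host for each i02 slice; it now passes a base position p0 (n_past*freq_scale, or 0 when the mode bit clears it), a per-row delta equal to freq_scale, and ne01 as the number of rows that share one position, which is what allows the non-GLM case to be flattened (see the ggml_cuda_rope change further down). The reference below assumes the kernel derives each element's angle as (p0 + p_delta*(row / p_delta_rows)) * theta_scale^(col/2); that formula is inferred from these call sites, so treat it as an illustration rather than the kernel's exact code.

    #include <cmath>
    #include <cstdio>

    // Host-side reference for the per-element RoPE angle, assuming the kernel
    // computes theta = (p0 + p_delta*(row / p_delta_rows)) * theta_scale^(col/2).
    float rope_theta(int row, int col, float p0, float p_delta, int p_delta_rows, float theta_scale) {
        // integer division: all rows belonging to one token map to the same position
        const float p = p0 + p_delta * (float)(row / p_delta_rows);
        return p * std::pow(theta_scale, (float)(col / 2));
    }

    int main() {
        const int   n_dims      = 128;
        const float freq_base   = 10000.0f;
        const float freq_scale  = 1.0f;
        const float theta_scale = std::pow(freq_base, -2.0f/n_dims);  // as in ggml_cuda_op_rope
        const int   n_past = 7, ne01 = 32;  // example: 8th position, 32 rows per token
        // p0 = n_past*freq_scale, p_delta = freq_scale, p_delta_rows = ne01, as in the new call site
        std::printf("theta(row=0, col=0) = %f\n",
                    rope_theta(0, 0, n_past*freq_scale, freq_scale, ne01, theta_scale));
        return 0;
    }
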
@@ -3362,8 +5422,17 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3362
5422
 
3363
5423
  int64_t row_low, row_high;
3364
5424
  if (split) {
5425
+ const int64_t rounding = get_row_rounding(src0->type);
5426
+
3365
5427
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
3366
- row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
5428
+ row_low -= row_low % rounding;
5429
+
5430
+ if (id == g_device_count - 1) {
5431
+ row_high = nrows0;
5432
+ } else {
5433
+ row_high = nrows0*g_tensor_split[id + 1];
5434
+ row_high -= row_high % rounding;
5435
+ }
3367
5436
  } else {
3368
5437
  row_low = 0;
3369
5438
  row_high = nrows0*i02_divisor;
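
With the mul_mat_q kernels, the rows assigned to each GPU of a split tensor must start on a tile boundary, so both ggml_cuda_op here and ggml_cuda_transform_tensor further down round row_low and row_high down to a multiple of get_row_rounding(type) (128 for most quantization types on Turing, 64 otherwise), and the last device simply takes everything that remains. A host-side restatement of that arithmetic, with an invented split_rows helper and example numbers:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the split logic in ggml_cuda_op / ggml_cuda_transform_tensor:
    // each device's row range is aligned down to `rounding`, and the last
    // device always ends at nrows.
    void split_rows(int64_t nrows, const float * tensor_split, int device_count, int64_t rounding) {
        for (int id = 0; id < device_count; ++id) {
            int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows * tensor_split[id]);
            row_low         -= row_low % rounding;

            int64_t row_high;
            if (id == device_count - 1) {
                row_high = nrows;
            } else {
                row_high  = (int64_t)(nrows * tensor_split[id + 1]);
                row_high -= row_high % rounding;
            }
            std::printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
    }

    int main() {
        const float tensor_split[2] = {0.0f, 0.6f};   // hypothetical 60/40 VRAM split
        split_rows(4096 + 50, tensor_split, 2, 128);  // rounding = 128, e.g. q4_0 on Turing
        return 0;
    }
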
@@ -3529,13 +5598,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3529
5598
  if (split) {
3530
5599
  // src0 = weight matrix is saved as a transposed matrix for better memory layout.
3531
5600
  // dst is NOT transposed.
3532
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
5601
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
3533
5602
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
3534
5603
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
3535
- for (int64_t j = 0; j < ne1; ++j) {
3536
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
3537
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
3538
- }
5604
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
5605
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
5606
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
3539
5607
  } else {
3540
5608
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
3541
5609
  CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
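
For the split case, the per-row cudaMemcpyAsync loop is replaced by one cudaMemcpy2DAsync: the destination pitch is the full output row (ne0 floats), the source pitch is the computed slice width (i01_diff floats), and ne1 rows are copied in a single call. The host-side sketch below maps those pitch parameters onto an ordinary memcpy loop; the column offset i01_low is passed explicitly here, whereas the real call folds it into the dhf_dst_i base pointer.

    #include <cstring>
    #include <cstdio>
    #include <vector>

    // Host equivalent of the cudaMemcpy2DAsync call in the diff: copy ne1 rows
    // of `width` floats from a densely packed source into a destination whose
    // rows are ne0 floats apart, starting at column i01_low.
    void copy_2d(float * dst, size_t ne0, const float * src, size_t width, size_t ne1, size_t i01_low) {
        for (size_t j = 0; j < ne1; ++j) {            // height = ne1 rows
            std::memcpy(dst + j * ne0 + i01_low,      // dpitch = ne0*sizeof(float)
                        src + j * width,              // spitch = i01_diff*sizeof(float)
                        width * sizeof(float));       // width  = i01_diff*sizeof(float)
        }
    }

    int main() {
        const size_t ne0 = 8, ne1 = 2, i01_low = 3, i01_diff = 4;
        std::vector<float> dst(ne0 * ne1, 0.0f);
        std::vector<float> src = {1, 2, 3, 4, 5, 6, 7, 8};  // two rows of i01_diff values
        copy_2d(dst.data(), ne0, src.data(), i01_diff, ne1, i01_low);
        for (float v : dst) std::printf("%.0f ", v);
        std::printf("\n");
        return 0;
    }
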
@@ -3576,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
3576
5644
  if (split && g_device_count > 1) {
3577
5645
  CUDA_CHECK(cudaSetDevice(g_main_device));
3578
5646
  for (int id = 0; id < g_device_count; ++id) {
3579
- if (id != g_main_device) {
5647
+ if (id != g_main_device && src0_extra->events[id]) {
3580
5648
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
3581
5649
  }
3582
5650
  }
@@ -3718,7 +5786,19 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
3718
5786
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
3719
5787
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
3720
5788
  } else {
3721
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5789
+ int min_compute_capability = INT_MAX;
5790
+ for (int id = 0; id < g_device_count; ++id) {
5791
+ if (min_compute_capability > g_compute_capabilities[id]
5792
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5793
+ min_compute_capability = g_compute_capabilities[id];
5794
+ }
5795
+ }
5796
+
5797
+ if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
5798
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
5799
+ } else {
5800
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
5801
+ }
3722
5802
  }
3723
5803
  } else {
3724
5804
  GGML_ASSERT(false);
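
ggml_cuda_mul_mat now takes the minimum compute capability over the devices that actually hold a slice of src0 and routes quantized matrices to ggml_cuda_op_mul_mat_q only when that minimum supports __dp4a (MIN_CC_DP4A, 610) and the g_mul_mat_q flag is set; everything else keeps going through the cuBLAS path. A compact restatement of that routing decision, with an illustrative use_mul_mat_q helper:

    #include <climits>
    #include <cstdio>

    constexpr int MIN_CC_DP4A = 610;  // same threshold used by the diff

    // Routing decision extracted from ggml_cuda_mul_mat: use the custom
    // quantized kernels only if every participating device supports __dp4a.
    bool use_mul_mat_q(bool mul_mat_q_enabled, bool src0_is_quantized,
                       const int * compute_capabilities, const float * tensor_split, int device_count) {
        int min_cc = INT_MAX;
        for (int id = 0; id < device_count; ++id) {
            const float next_split = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
            if (min_cc > compute_capabilities[id] && tensor_split[id] < next_split) {
                min_cc = compute_capabilities[id];   // device holds a non-empty slice
            }
        }
        return mul_mat_q_enabled && src0_is_quantized && min_cc >= MIN_CC_DP4A;
    }

    int main() {
        const int   cc[2]    = {610, 860};  // e.g. a GTX 1080 Ti paired with an RTX 3090
        const float split[2] = {0.0f, 0.5f};
        std::printf("mul_mat_q path: %s\n",
                    use_mul_mat_q(true, true, cc, split, 2) ? "yes" : "no");
        return 0;
    }
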
@@ -3795,7 +5875,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
3795
5875
 
3796
5876
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3797
5877
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
3798
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
5878
+
5879
+ const int mode = ((int32_t *) dst->op_params)[2];
5880
+ const bool is_glm = mode & 4;
5881
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
3799
5882
  }
3800
5883
 
3801
5884
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3827,8 +5910,17 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3827
5910
  row_low = 0;
3828
5911
  row_high = nrows;
3829
5912
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
5913
+ const int64_t rounding = get_row_rounding(tensor->type);
5914
+
3830
5915
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
3831
- row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
5916
+ row_low -= row_low % rounding;
5917
+
5918
+ if (id == g_device_count - 1) {
5919
+ row_high = nrows;
5920
+ } else {
5921
+ row_high = nrows*g_tensor_split[id + 1];
5922
+ row_high -= row_high % rounding;
5923
+ }
3832
5924
  } else {
3833
5925
  GGML_ASSERT(false);
3834
5926
  }
@@ -4002,6 +6094,10 @@ void ggml_cuda_set_main_device(int main_device) {
4002
6094
  }
4003
6095
  }
4004
6096
 
6097
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
6098
+ g_mul_mat_q = mul_mat_q;
6099
+ }
6100
+
4005
6101
  void ggml_cuda_set_scratch_size(size_t scratch_size) {
4006
6102
  g_scratch_size = scratch_size;
4007
6103
  }