llama_cpp 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11
 
-#ifdef
+#ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //
+#endif //GGML_CUDA_F16
+
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
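For reference, the new get_int_from_int8/get_int_from_uint8 helpers above read four bytes of quant data through two 16-bit loads, so buffers guaranteed only 2-byte alignment can still be consumed one int at a time by the dot-product kernels. A minimal host-side sketch of the same trick (illustration only; little-endian layout assumed, as on the CUDA targets):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Host-side mirror of get_int_from_uint8(): combine two 16-bit loads into one
// 32-bit word instead of a 4-byte load that would require 4-byte alignment.
static int get_int_from_uint8(const uint8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}

int main() {
    uint8_t qs[8] = {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0x70};

    int direct = 0;
    std::memcpy(&direct, qs + 4, sizeof(direct)); // reference: plain 4-byte copy

    // Both lines print 70debc9a on a little-endian machine.
    std::printf("%08x\n%08x\n", (unsigned) get_int_from_uint8(qs, 1), (unsigned) direct);
    return 0;
}
```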
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QR4_1 2
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
-
-    half m; // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QR5_1 2
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
-
-    half m; // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qh[4]; // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QR8_1 1
 #define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
-
-    half s; // unquantized sum
+    half2 ds; // ds.x = delta, ds.y = sum
     int8_t qs[QK8_0]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
+typedef void (*load_tiles_cuda_t)(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
 
 //================================= k-quants
 
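The separate half d / half m (and half d / half s) fields are folded into a single half2 above so that delta and min, or delta and block sum, arrive in one 32-bit load. Storing the q8_1 block sum is what lets the q4_1/q5_1-style dot products later in this diff handle the min term without touching individual y quants, since sum_i (d4*q_i + m4) * (d8*u_i) = d4*d8 * sum_i(q_i*u_i) + m4 * (d8 * sum_i(u_i)). A small numeric check of that identity (made-up values, not the CUDA kernel):

```cpp
#include <cstdio>

int main() {
    const int   n  = 8;
    const float d4 = 0.25f, m4 = -1.0f;  // q4_1-style scale and min (made-up)
    const float d8 = 0.5f;               // q8_1-style scale (made-up)
    const int   q[n] = {0, 3, 7, 15, 8, 1, 12, 5};     // 4-bit quants
    const int   u[n] = {-7, 4, 19, -30, 2, 0, 11, -5}; // 8-bit quants

    float naive = 0.0f;
    int   sumi  = 0;   // integer dot product, what __dp4a accumulates on the GPU
    int   sum_u = 0;   // quant sum; the kernel stores the float sum (about d8*sum_u) in ds.y
    for (int i = 0; i < n; ++i) {
        naive += (d4*q[i] + m4) * (d8*u[i]);
        sumi  += q[i]*u[i];
        sum_u += u[i];
    }

    const float s8    = d8 * sum_u;
    const float fused = d4*d8*sumi + m4*s8; // same shape as vec_dot_q4_1_q8_1_impl's result

    std::printf("naive = %f\nfused = %f\n", naive, fused); // agree up to rounding
    return 0;
}
```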
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
-
-    half dmin; // super-block scale for quantized mins
+    half2 dm; // super-block scale for quantized scales/mins
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -180,8 +211,7 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
-
-    half dmin; // super-block scale for quantized mins
+    half2 dm; // super-block scale for quantized scales/mins
     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
@@ -200,11 +230,10 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
 #else
 typedef struct {
-
-
-    uint8_t
-    uint8_t
-    uint8_t qs[QK_K/2]; // quants, low 4 bits
+    half2 dm; // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
+#ifndef GGML_CUDA_MMQ_Y
+#define GGML_CUDA_MMQ_Y 64
+#endif // GGML_CUDA_MMQ_Y
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -367,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
     v.x = vui & 0xF;
     v.y = vui >> 4;
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {8.0f, 8.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 8.0f) * d;
     v.y = (v.y - 8.0f) * d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].
-    const dfloat m = x[ib].
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     const int vui = x[ib].qs[iqs];
 
     v.x = vui & 0xF;
     v.y = vui >> 4;
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
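The unchanged context above also shows the q4_0 unpacking these kernels build on: one byte of qs carries two 4-bit quants, each recentred by 8 and scaled by the block delta d. A host-side sketch with made-up values:

```cpp
#include <cstdio>

int main() {
    const unsigned vui = 0xC3;  // one packed byte: low nibble 3, high nibble 12
    const float    d   = 0.5f;  // block delta (made-up)

    const float x0 = ((vui & 0xF) - 8.0f) * d; // ( 3 - 8) * 0.5 = -2.5
    const float x1 = ((vui >> 4) - 8.0f) * d;  // (12 - 8) * 0.5 =  2.0

    std::printf("%.1f %.1f\n", x0, x1);
    return 0;
}
```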
@@ -410,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {16.0f, 16.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 16.0f) * d;
     v.y = (v.y - 16.0f) * d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].
-    const dfloat m = x[ib].
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
     v.x = x[ib].qs[iqs + 0];
     v.y = x[ib].qs[iqs + 1];
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
 #else
     v.x *= d;
     v.y *= d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 //================================== k-quants
@@ -475,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].
-    float dmin = x[i].
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].
-    float dmin = x[i].
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -573,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -612,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     const float * y = yy + i * QK_K + y_offset;
     const uint8_t * q = x[i].qs + q_offset;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
     aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     uaux[0] = s[0] & 0x0f0f0f0f;
     uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
 
-    const
-
-    const float2 dall = __half22float2(dh[0]);
+    const float2 dall = __half22float2(x[i].dm);
 
     float sum1 = 0, sum2 = 0;
     for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +979,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1112,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int
-    const int
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (
+    if (ix >= kx_padded) {
         return;
     }
 
+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib =
-    const int iqs =
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
 
-    const float xi =
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
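quantize_q8_1 above now receives both the logical row width kx and the padded width kx_padded; each thread derives its destination block (ib) and slot (iqs) from the padded linear index, and columns past kx quantize a zero. A host-side sketch of just that index arithmetic (widths are made up, QK8_1 is 32 as in ggml):

```cpp
#include <cstdio>

int main() {
    const int QK8_1     = 32;  // quants per block_q8_1
    const int kx        = 70;  // logical row width (made-up)
    const int kx_padded = 96;  // row width padded to a multiple of QK8_1

    for (int iy = 0; iy < 2; ++iy) {                 // two rows
        for (int ix = 0; ix < kx_padded; ix += 31) { // a few sample columns
            const int i_padded = iy*kx_padded + ix;
            const int ib  = i_padded / QK8_1;        // block index
            const int iqs = i_padded % QK8_1;        // quant index within the block
            const bool pad = ix >= kx;               // padding columns store a quantized 0
            std::printf("row %d col %2d -> block %d slot %2d%s\n",
                        iy, ix, ib, iqs, pad ? " (pad)" : "");
        }
    }
    return 0;
}
```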
@@ -1301,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].
-    y[ib].
+    y[ib].ds.x = d;
+    y[ib].ds.y = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
|
|
1326
1361
|
y[iybs + iqs + y_offset] = v.y;
|
1327
1362
|
}
|
1328
1363
|
|
1329
|
-
|
1330
|
-
|
1331
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1332
|
-
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1364
|
+
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
1365
|
+
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
1333
1366
|
|
1334
|
-
|
1335
|
-
|
1336
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1337
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
|
1367
|
+
#define VDR_Q4_0_Q8_1_MMVQ 2
|
1368
|
+
#define VDR_Q4_0_Q8_1_MMQ 4
|
1338
1369
|
|
1339
|
-
|
1370
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
1371
|
+
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
1372
|
+
|
1373
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1374
|
+
int sumi = 0;
|
1340
1375
|
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1376
|
+
#pragma unroll
|
1377
|
+
for (int i = 0; i < vdr; ++i) {
|
1378
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1379
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1344
1380
|
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1381
|
+
// SIMD dot product of quantized values
|
1382
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1383
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1384
|
+
}
|
1348
1385
|
|
1349
|
-
|
1386
|
+
// second part effectively subtracts 8 from each quant value
|
1387
|
+
return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
|
1350
1388
|
#else
|
1351
1389
|
return 0.0f; // only to satisfy the compiler
|
1352
1390
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1353
1391
|
}
|
1354
1392
|
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1393
|
+
#define VDR_Q4_1_Q8_1_MMVQ 2
|
1394
|
+
#define VDR_Q4_1_Q8_1_MMQ 4
|
1395
|
+
|
1396
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
1397
|
+
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
1359
1398
|
|
1360
|
-
|
1361
|
-
|
1362
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
|
1399
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1400
|
+
int sumi = 0;
|
1363
1401
|
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1402
|
+
#pragma unroll
|
1403
|
+
for (int i = 0; i < vdr; ++i) {
|
1404
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1405
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1367
1406
|
|
1368
|
-
|
1369
|
-
|
1407
|
+
// SIMD dot product of quantized values
|
1408
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1409
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1410
|
+
}
|
1370
1411
|
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1412
|
+
#ifdef GGML_CUDA_F16
|
1413
|
+
const half2 tmp = __hmul2(dm4, ds8);
|
1414
|
+
const float d4d8 = __half2float(tmp.x);
|
1415
|
+
const float m4s8 = __half2float(tmp.y);
|
1416
|
+
#else
|
1417
|
+
const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
|
1418
|
+
const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
|
1419
|
+
#endif // GGML_CUDA_F16
|
1374
1420
|
|
1375
|
-
|
1421
|
+
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1422
|
+
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1376
1423
|
#else
|
1377
1424
|
return 0.0f; // only to satisfy the compiler
|
1378
1425
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1379
1426
|
}
|
1380
1427
|
|
1381
|
-
|
1382
|
-
|
1428
|
+
#define VDR_Q5_0_Q8_1_MMVQ 2
|
1429
|
+
#define VDR_Q5_0_Q8_1_MMQ 4
|
1430
|
+
|
1431
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
1432
|
+
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
1433
|
+
|
1383
1434
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1384
|
-
|
1435
|
+
int sumi = 0;
|
1436
|
+
|
1437
|
+
for (int i = 0; i < vdr; ++i) {
|
1438
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1439
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1440
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1441
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1442
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1443
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1444
|
+
|
1445
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1446
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1447
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1448
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1449
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1450
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1451
|
+
}
|
1385
1452
|
|
1386
|
-
|
1387
|
-
|
1388
|
-
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1389
|
-
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1390
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1391
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
|
1392
|
-
|
1393
|
-
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
|
1394
|
-
|
1395
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1396
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1397
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1398
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1399
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1400
|
-
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
|
1401
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1402
|
-
|
1403
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1404
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1405
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1406
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1407
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1408
|
-
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
|
1409
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1410
|
-
|
1411
|
-
return sumi*d;
|
1453
|
+
// second part effectively subtracts 16 from each quant value
|
1454
|
+
return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
|
1412
1455
|
#else
|
1413
1456
|
return 0.0f; // only to satisfy the compiler
|
1414
1457
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1415
1458
|
}
|
1416
1459
|
|
1417
|
-
|
1418
|
-
|
1460
|
+
#define VDR_Q5_1_Q8_1_MMVQ 2
|
1461
|
+
#define VDR_Q5_1_Q8_1_MMQ 4
|
1462
|
+
|
1463
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
|
1464
|
+
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
|
1465
|
+
|
1419
1466
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1420
|
-
|
1467
|
+
int sumi = 0;
|
1468
|
+
|
1469
|
+
for (int i = 0; i < vdr; ++i) {
|
1470
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1471
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1472
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1473
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1474
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1475
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1476
|
+
|
1477
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1478
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1479
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1480
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1481
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1482
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1483
|
+
}
|
1484
|
+
|
1485
|
+
#ifdef GGML_CUDA_F16
|
1486
|
+
const half2 tmp = __hmul2(dm5, ds8);
|
1487
|
+
const float d5d8 = __half2float(tmp.x);
|
1488
|
+
const float m5s8 = __half2float(tmp.y);
|
1489
|
+
#else
|
1490
|
+
const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
|
1491
|
+
const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
|
1492
|
+
#endif // GGML_CUDA_F16
|
1493
|
+
|
1494
|
+
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
1495
|
+
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1421
1496
|
|
1422
|
-
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
|
1423
|
-
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1424
|
-
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1425
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1426
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
|
1427
|
-
|
1428
|
-
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
|
1429
|
-
const float m = bq5_1->m;
|
1430
|
-
const float s = bq8_1->s;
|
1431
|
-
|
1432
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1433
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1434
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1435
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1436
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1437
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1438
|
-
|
1439
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1440
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1441
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1442
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1443
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1444
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1445
|
-
|
1446
|
-
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
|
1447
1497
|
#else
|
1448
1498
|
return 0.0f; // only to satisfy the compiler
|
1449
1499
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1450
1500
|
}
|
1451
1501
|
|
1452
|
-
|
1453
|
-
|
1454
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1455
|
-
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1502
|
+
#define VDR_Q8_0_Q8_1_MMVQ 2
|
1503
|
+
#define VDR_Q8_0_Q8_1_MMQ 8
|
1456
1504
|
|
1457
|
-
|
1458
|
-
|
1459
|
-
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1505
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
|
1506
|
+
const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
|
1460
1507
|
|
1461
|
-
|
1508
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1509
|
+
int sumi = 0;
|
1462
1510
|
|
1463
|
-
|
1464
|
-
|
1511
|
+
for (int i = 0; i < vdr; ++i) {
|
1512
|
+
// SIMD dot product of quantized values
|
1513
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1514
|
+
}
|
1465
1515
|
|
1466
|
-
return sumi*
|
1516
|
+
return sumi * d8_0 * __half2float(ds8_1.x);
|
1467
1517
|
#else
|
1468
1518
|
return 0.0f; // only to satisfy the compiler
|
1469
1519
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1470
1520
|
}
|
1471
1521
|
|
1472
|
-
static __device__ __forceinline__ float
|
1473
|
-
const
|
1522
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
|
1523
|
+
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
|
1474
1524
|
|
1475
1525
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1476
|
-
|
1526
|
+
int sumi = 0;
|
1477
1527
|
|
1478
|
-
|
1479
|
-
|
1528
|
+
for (int i = 0; i < vdr; ++i) {
|
1529
|
+
// SIMD dot product of quantized values
|
1530
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1531
|
+
}
|
1480
1532
|
|
1481
|
-
|
1482
|
-
|
1533
|
+
#ifdef GGML_CUDA_F16
|
1534
|
+
const half2 tmp = __hmul2(dm8, ds8);
|
1535
|
+
const float d8d8 = __half2float(tmp.x);
|
1536
|
+
const float m8s8 = __half2float(tmp.y);
|
1537
|
+
#else
|
1538
|
+
const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
|
1539
|
+
const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
|
1540
|
+
#endif // GGML_CUDA_F16
|
1483
1541
|
|
1484
|
-
|
1485
|
-
|
1542
|
+
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1543
|
+
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1544
|
+
#else
|
1545
|
+
return 0.0f; // only to satisfy the compiler
|
1546
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1547
|
+
}
|
1486
1548
|
|
1487
|
-
|
1549
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
1550
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1488
1551
|
|
1489
|
-
|
1490
|
-
const int sc = bq2_K->scales[scale_offset + 2*i];
|
1552
|
+
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1491
1553
|
|
1492
|
-
|
1493
|
-
|
1554
|
+
int v[VDR_Q4_0_Q8_1_MMVQ];
|
1555
|
+
int u[2*VDR_Q4_0_Q8_1_MMVQ];
|
1494
1556
|
|
1495
|
-
|
1496
|
-
|
1557
|
+
#pragma unroll
|
1558
|
+
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
|
1559
|
+
v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
|
1560
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1561
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
|
1562
|
+
}
|
1563
|
+
|
1564
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1568
|
+
|
1569
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1570
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
|
1571
|
+
|
1572
|
+
*x_ql = tile_x_qs;
|
1573
|
+
*x_dm = (half2 *) tile_x_d;
|
1574
|
+
}
|
1575
|
+
|
1576
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1577
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1578
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1579
|
+
|
1580
|
+
__builtin_assume(i_offset >= 0);
|
1581
|
+
__builtin_assume(i_offset < 8);
|
1582
|
+
__builtin_assume(k >= 0);
|
1583
|
+
__builtin_assume(k < WARP_SIZE);
|
1584
|
+
|
1585
|
+
const int kbx = k / QI4_0;
|
1586
|
+
const int kqsx = k % QI4_0;
|
1587
|
+
|
1588
|
+
const block_q4_0 * bx0 = (block_q4_0 *) vx;
|
1589
|
+
|
1590
|
+
float * x_dmf = (float *) x_dm;
|
1591
|
+
|
1592
|
+
#pragma unroll
|
1593
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1594
|
+
int i = i0 + i_offset;
|
1497
1595
|
|
1498
|
-
|
1499
|
-
|
1596
|
+
if (need_check) {
|
1597
|
+
i = min(i, i_max);
|
1598
|
+
}
|
1599
|
+
|
1600
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1601
|
+
|
1602
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1603
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1500
1604
|
}
|
1501
1605
|
|
1502
|
-
|
1503
|
-
|
1504
|
-
|
1505
|
-
|
1606
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1607
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
1608
|
+
|
1609
|
+
// #pragma unroll
|
1610
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
|
1611
|
+
// FIXME out-of-bounds
|
1612
|
+
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1613
|
+
|
1614
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
1615
|
+
// return;
|
1616
|
+
// }
|
1617
|
+
|
1618
|
+
// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1619
|
+
|
1620
|
+
// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
|
1621
|
+
// }
|
1506
1622
|
}
|
1507
1623
|
|
1508
|
-
static __device__ __forceinline__ float
|
1509
|
-
const
|
1624
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1625
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1626
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1510
1627
|
|
1511
|
-
|
1512
|
-
|
1628
|
+
__builtin_assume(i >= 0);
|
1629
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1630
|
+
__builtin_assume(j >= 0);
|
1631
|
+
__builtin_assume(j < WARP_SIZE);
|
1632
|
+
__builtin_assume(k >= 0);
|
1633
|
+
__builtin_assume(k < WARP_SIZE);
|
1513
1634
|
|
1514
|
-
const int
|
1515
|
-
const
|
1635
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1636
|
+
const float * x_dmf = (float *) x_dm;
|
1516
1637
|
|
1517
|
-
|
1638
|
+
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
1518
1639
|
|
1519
|
-
|
1640
|
+
#pragma unroll
|
1641
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1642
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1643
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
|
1644
|
+
}
|
1520
1645
|
|
1521
|
-
|
1522
|
-
|
1646
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1647
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1648
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1649
|
+
}
|
1523
1650
|
|
1524
|
-
|
1525
|
-
|
1526
|
-
vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
|
1527
|
-
vh >>= bq8_offset;
|
1651
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
1652
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1528
1653
|
|
1529
|
-
|
1530
|
-
const int isc = scale_offset + 2*i;
|
1654
|
+
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1531
1655
|
|
1532
|
-
|
1533
|
-
|
1534
|
-
const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
|
1656
|
+
int v[VDR_Q4_1_Q8_1_MMVQ];
|
1657
|
+
int u[2*VDR_Q4_1_Q8_1_MMVQ];
|
1535
1658
|
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1659
|
+
#pragma unroll
|
1660
|
+
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
|
1661
|
+
v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
|
1662
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1663
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
|
1664
|
+
}
|
1539
1665
|
|
1540
|
-
|
1666
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1667
|
+
}
|
1541
1668
|
|
1542
|
-
|
1543
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1544
|
-
const float d8i = bq8i->d;
|
1669
|
+
static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1545
1670
|
|
1546
|
-
|
1671
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
|
1672
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
|
1547
1673
|
|
1548
|
-
|
1674
|
+
*x_ql = tile_x_qs;
|
1675
|
+
*x_dm = tile_x_dm;
|
1676
|
+
}
|
1549
1677
|
|
1550
|
-
|
1678
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
1679
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1680
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1681
|
+
|
1682
|
+
__builtin_assume(i_offset >= 0);
|
1683
|
+
__builtin_assume(i_offset < 8);
|
1684
|
+
__builtin_assume(k >= 0);
|
1685
|
+
__builtin_assume(k < WARP_SIZE);
|
1686
|
+
|
1687
|
+
const int kbx = k / QI4_1;
|
1688
|
+
const int kqsx = k % QI4_1;
|
1689
|
+
|
1690
|
+
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
1551
1691
|
|
1552
|
-
|
1692
|
+
#pragma unroll
|
1693
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1694
|
+
int i = i0 + i_offset;
|
1695
|
+
|
1696
|
+
if (need_check) {
|
1697
|
+
i = min(i, i_max);
|
1698
|
+
}
|
1699
|
+
|
1700
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1701
|
+
|
1702
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1553
1703
|
}
|
1554
1704
|
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
#
|
1705
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
|
1706
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1707
|
+
|
1708
|
+
#pragma unroll
|
1709
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
|
1710
|
+
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
1711
|
+
|
1712
|
+
if (need_check) {
|
1713
|
+
i = min(i, i_max);
|
1714
|
+
}
|
1715
|
+
|
1716
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1717
|
+
|
1718
|
+
x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
|
1719
|
+
}
|
1559
1720
|
}
|
1560
1721
|
|
1561
|
-
static __device__ __forceinline__ float
|
1562
|
-
const
|
1722
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
1723
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1724
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1563
1725
|
|
1564
|
-
|
1565
|
-
|
1726
|
+
__builtin_assume(i >= 0);
|
1727
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1728
|
+
__builtin_assume(j >= 0);
|
1729
|
+
__builtin_assume(j < WARP_SIZE);
|
1730
|
+
__builtin_assume(k >= 0);
|
1731
|
+
__builtin_assume(k < WARP_SIZE);
|
1566
1732
|
|
1567
|
-
|
1568
|
-
float sumf_m = 0.0f;
|
1733
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1569
1734
|
|
1570
|
-
|
1735
|
+
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
1571
1736
|
|
1572
|
-
|
1573
|
-
|
1737
|
+
#pragma unroll
|
1738
|
+
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
1739
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1740
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
|
1741
|
+
}
|
1574
1742
|
|
1575
|
-
|
1576
|
-
|
1743
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
1744
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
1745
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1746
|
+
}
|
1577
1747
|
|
1578
|
-
|
1579
|
-
|
1580
|
-
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
1581
|
-
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
1748
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
1749
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1582
1750
|
|
1583
|
-
const
|
1584
|
-
const int v1 = q4[0];
|
1585
|
-
const int v2 = q4[4];
|
1751
|
+
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
|
1586
1752
|
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
1594
|
-
|
1595
|
-
|
1753
|
+
int vl[VDR_Q5_0_Q8_1_MMVQ];
|
1754
|
+
int vh[VDR_Q5_0_Q8_1_MMVQ];
|
1755
|
+
int u[2*VDR_Q5_0_Q8_1_MMVQ];
|
1756
|
+
|
1757
|
+
#pragma unroll
|
1758
|
+
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
|
1759
|
+
vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
|
1760
|
+
vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
|
1761
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1762
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
|
1596
1763
|
}
|
1597
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
1598
|
-
const uint8_t * m = sc + 2;
|
1599
1764
|
|
1600
|
-
|
1765
|
+
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
1766
|
+
}
|
1767
|
+
|
1768
|
+
static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1769
|
+
|
1770
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1771
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
|
1772
|
+
|
1773
|
+
*x_ql = tile_x_ql;
|
1774
|
+
*x_dm = (half2 *) tile_x_d;
|
1775
|
+
}
|
1776
|
+
|
1777
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
1778
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1779
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1780
|
+
|
1781
|
+
__builtin_assume(i_offset >= 0);
|
1782
|
+
__builtin_assume(i_offset < 8);
|
1783
|
+
__builtin_assume(k >= 0);
|
1784
|
+
__builtin_assume(k < WARP_SIZE);
|
1785
|
+
|
1786
|
+
const int kbx = k / QI5_0;
|
1787
|
+
const int kqsx = k % QI5_0;
|
1788
|
+
|
1789
|
+
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
1790
|
+
|
1791
|
+
#pragma unroll
|
1792
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1793
|
+
int i = i0 + i_offset;
|
1794
|
+
|
1795
|
+
if (need_check) {
|
1796
|
+
i = min(i, i_max);
|
1797
|
+
}
|
1798
|
+
|
1799
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1800
|
+
|
1801
|
+
const int ql = get_int_from_uint8(bxi->qs, kqsx);
|
1802
|
+
const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
|
1803
|
+
|
1804
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1805
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1806
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1807
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1808
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1809
|
+
qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
|
1810
|
+
|
1811
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1812
|
+
|
1813
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1814
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1815
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1816
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1817
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1818
|
+
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
1819
|
+
|
1820
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1821
|
+
}
|
1822
|
+
|
1823
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
|
1824
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1825
|
+
float * x_dmf = (float *) x_dm;
|
1826
|
+
|
1827
|
+
#pragma unroll
|
1828
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
|
1829
|
+
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
1830
|
+
|
1831
|
+
if (need_check) {
|
1832
|
+
i = min(i, i_max);
|
1833
|
+
}
|
1834
|
+
|
1835
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1836
|
+
|
1837
|
+
x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
|
1838
|
+
}
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
1842
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1843
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1844
|
+
|
1845
|
+
__builtin_assume(i >= 0);
|
1846
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1847
|
+
__builtin_assume(j >= 0);
|
1848
|
+
__builtin_assume(j < WARP_SIZE);
|
1849
|
+
__builtin_assume(k >= 0);
|
1850
|
+
__builtin_assume(k < WARP_SIZE);
|
1851
|
+
|
1852
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1853
|
+
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
1854
|
+
const float * x_dmf = (float *) x_dm;
|
1855
|
+
|
1856
|
+
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
1857
|
+
|
1858
|
+
#pragma unroll
|
1859
|
+
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
1860
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1861
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
1865
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1866
|
+
}
|
1867
|
+
|
1868
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
1869
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1870
|
+
|
1871
|
+
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
|
1872
|
+
|
1873
|
+
int vl[VDR_Q5_1_Q8_1_MMVQ];
|
1874
|
+
int vh[VDR_Q5_1_Q8_1_MMVQ];
|
1875
|
+
int u[2*VDR_Q5_1_Q8_1_MMVQ];
|
1876
|
+
|
1877
|
+
#pragma unroll
|
1878
|
+
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
|
1879
|
+
vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
|
1880
|
+
vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
|
1881
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1882
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
|
1883
|
+
}
|
1884
|
+
|
1885
|
+
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
1886
|
+
}
|
1887
|
+
|
1888
|
+
static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1889
|
+
|
1890
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1891
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
|
1892
|
+
|
1893
|
+
*x_ql = tile_x_ql;
|
1894
|
+
*x_dm = tile_x_dm;
|
1895
|
+
}
|
1896
|
+
|
1897
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
1898
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1899
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1900
|
+
|
1901
|
+
__builtin_assume(i_offset >= 0);
|
1902
|
+
__builtin_assume(i_offset < 8);
|
1903
|
+
__builtin_assume(k >= 0);
|
1904
|
+
__builtin_assume(k < WARP_SIZE);
|
1905
|
+
|
1906
|
+
const int kbx = k / QI5_1;
|
1907
|
+
const int kqsx = k % QI5_1;
|
1908
|
+
|
1909
|
+
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
1910
|
+
|
1911
|
+
#pragma unroll
|
1912
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1913
|
+
int i = i0 + i_offset;
|
1914
|
+
|
1915
|
+
if (need_check) {
|
1916
|
+
i = min(i, i_max);
|
1917
|
+
}
|
1918
|
+
|
1919
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1920
|
+
|
1921
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1922
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
|
1923
|
+
|
1924
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1925
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1926
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1927
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1928
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1929
|
+
|
1930
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1931
|
+
|
1932
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1933
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1934
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1935
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1936
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1937
|
+
|
1938
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1939
|
+
}
|
1940
|
+
|
1941
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
|
1942
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1943
|
+
|
1944
|
+
#pragma unroll
|
1945
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
|
1946
|
+
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
1947
|
+
|
1948
|
+
if (need_check) {
|
1949
|
+
i = min(i, i_max);
|
1950
|
+
}
|
1951
|
+
|
1952
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1953
|
+
|
1954
|
+
x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
|
1955
|
+
}
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
1959
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1960
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1961
|
+
|
1962
|
+
__builtin_assume(i >= 0);
|
1963
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1964
|
+
__builtin_assume(j >= 0);
|
1965
|
+
__builtin_assume(j < WARP_SIZE);
|
1966
|
+
__builtin_assume(k >= 0);
|
1967
|
+
__builtin_assume(k < WARP_SIZE);
|
1968
|
+
|
1969
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1970
|
+
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
1971
|
+
|
1972
|
+
int u[2*VDR_Q5_1_Q8_1_MMQ];
|
1973
|
+
|
1974
|
+
#pragma unroll
|
1975
|
+
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
1976
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1977
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
|
1978
|
+
}
|
1979
|
+
|
1980
|
+
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
1981
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1982
|
+
}
|
1983
|
+
|
1984
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
1985
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1986
|
+
|
1987
|
+
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1988
|
+
|
1989
|
+
int v[VDR_Q8_0_Q8_1_MMVQ];
|
1990
|
+
int u[VDR_Q8_0_Q8_1_MMVQ];
|
1991
|
+
|
1992
|
+
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
1993
|
+
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
1994
|
+
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1995
|
+
}
|
1996
|
+
|
1997
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
|
1998
|
+
}
|
1999
|
+
|
2000
|
+
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2001
|
+
|
2002
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2003
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
|
2004
|
+
|
2005
|
+
*x_ql = tile_x_qs;
|
2006
|
+
*x_dm = (half2 *) tile_x_d;
|
2007
|
+
}
|
2008
|
+
|
2009
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2010
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2011
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2012
|
+
|
2013
|
+
__builtin_assume(i_offset >= 0);
|
2014
|
+
__builtin_assume(i_offset < 8);
|
2015
|
+
__builtin_assume(k >= 0);
|
2016
|
+
__builtin_assume(k < WARP_SIZE);
|
2017
|
+
|
2018
|
+
const int kbx = k / QI8_0;
|
2019
|
+
const int kqsx = k % QI8_0;
|
2020
|
+
float * x_dmf = (float *) x_dm;
|
2021
|
+
|
2022
|
+
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2023
|
+
|
2024
|
+
#pragma unroll
|
2025
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2026
|
+
int i = i0 + i_offset;
|
2027
|
+
|
2028
|
+
if (need_check) {
|
2029
|
+
i = min(i, i_max);
|
2030
|
+
}
|
2031
|
+
|
2032
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2033
|
+
|
2034
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2035
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
|
2036
|
+
}
|
2037
|
+
|
2038
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2039
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
2040
|
+
|
2041
|
+
// #pragma unroll
|
2042
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
|
2043
|
+
// FIXME out-of-bounds
|
2044
|
+
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2045
|
+
|
2046
|
+
// #if GGML_CUDA_MMQ_Y < 64
|
2047
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
2048
|
+
// return;
|
2049
|
+
// }
|
2050
|
+
// #endif // GGML_CUDA_MMQ_Y < 64
|
2051
|
+
|
2052
|
+
// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2053
|
+
|
2054
|
+
// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
|
2055
|
+
// }
|
2056
|
+
}
|
2057
|
+
|
2058
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2059
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2060
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2061
|
+
|
2062
|
+
__builtin_assume(i >= 0);
|
2063
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2064
|
+
__builtin_assume(j >= 0);
|
2065
|
+
__builtin_assume(j < WARP_SIZE);
|
2066
|
+
__builtin_assume(k >= 0);
|
2067
|
+
__builtin_assume(k < WARP_SIZE);
|
2068
|
+
|
2069
|
+
const float * x_dmf = (float *) x_dm;
|
2070
|
+
|
2071
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2072
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2073
|
+
y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2074
|
+
}
|
2075
|
+
|
2076
|
+
#define VDR_q2_K_q8_1 1
|
2077
|
+
|
2078
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
|
2079
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2080
|
+
const half2 & dm, const float * __restrict__ d8) {
|
2081
|
+
|
2082
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2083
|
+
float sumf_d = 0.0f;
|
2084
|
+
float sumf_m = 0.0f;
|
2085
|
+
|
2086
|
+
for (int i = 0; i < QR2_K; ++i) {
|
2087
|
+
const int sc = scales[2*i];
|
2088
|
+
|
2089
|
+
const int vi = (v >> (2*i)) & 0x03030303;
|
2090
|
+
|
2091
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
2092
|
+
|
2093
|
+
int sc_high = sc >> 4;
|
2094
|
+
sc_high |= sc_high << 8;
|
2095
|
+
sc_high |= sc_high << 16;
|
2096
|
+
sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
const float2 dmf = __half22float2(dm);
|
2100
|
+
|
2101
|
+
return dmf.x*sumf_d - dmf.y*sumf_m;
|
2102
|
+
#else
|
2103
|
+
return 0.0f; // only to satisfy the compiler
|
2104
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2105
|
+
}
|
2106
|
+
|
2107
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
2108
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2109
|
+
|
2110
|
+
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
|
2111
|
+
|
2112
|
+
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
2113
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2114
|
+
|
2115
|
+
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2116
|
+
|
2117
|
+
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2118
|
+
int u[QR2_K];
|
2119
|
+
float d8[QR2_K];
|
2120
|
+
|
2121
|
+
for (int i = 0; i < QR2_K; ++ i) {
|
2122
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2123
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
|
2127
|
+
}
|
2128
|
+
|
2129
|
+
static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2130
|
+
|
2131
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2132
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
|
2133
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2134
|
+
|
2135
|
+
*x_ql = tile_x_ql;
|
2136
|
+
*x_dm = tile_x_dm;
|
2137
|
+
*x_sc = tile_x_sc;
|
2138
|
+
}
|
2139
|
+
|
2140
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2141
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2142
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2143
|
+
|
2144
|
+
__builtin_assume(i_offset >= 0);
|
2145
|
+
__builtin_assume(i_offset < 8);
|
2146
|
+
__builtin_assume(k >= 0);
|
2147
|
+
__builtin_assume(k < WARP_SIZE);
|
2148
|
+
|
2149
|
+
const int kbx = k / QI2_K;
|
2150
|
+
const int kqsx = k % QI2_K;
|
2151
|
+
|
2152
|
+
const block_q2_K * bx0 = (block_q2_K *) vx;
|
2153
|
+
|
2154
|
+
#pragma unroll
|
2155
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2156
|
+
int i = i0 + i_offset;
|
2157
|
+
|
2158
|
+
if (need_check) {
|
2159
|
+
i = min(i, i_max);
|
2160
|
+
}
|
2161
|
+
|
2162
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2163
|
+
|
2164
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2165
|
+
}
|
2166
|
+
|
2167
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
|
2168
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2169
|
+
|
2170
|
+
#pragma unroll
|
2171
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
|
2172
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2173
|
+
|
2174
|
+
if (need_check) {
|
2175
|
+
i = min(i, i_max);
|
2176
|
+
}
|
2177
|
+
|
2178
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2179
|
+
|
2180
|
+
x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
|
2181
|
+
}
|
2182
|
+
|
2183
|
+
#pragma unroll
|
2184
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2185
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2186
|
+
|
2187
|
+
if (need_check) {
|
2188
|
+
i = min(i, i_max);
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
|
2192
|
+
|
2193
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
|
2194
|
+
}
|
2195
|
+
}
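A note on the tile indexing used above (and in the other load_tiles_* functions): x_ql rows are laid out with a stride of WARP_SIZE + 1, and the dm/sc tiles add an extra i/QI2_K or i/4 term, so every row carries one element of padding. A plausible reading is that this staggers consecutive rows across shared-memory banks; the tiny host sketch below (illustrative names, WARP_SIZE assumed to be 32) only prints where padded row starts land:

#include <cstdio>

// Illustrative only: flat offsets of (row, col 0) with the padded stride
// WARP_SIZE + 1 used by x_ql above, assuming WARP_SIZE == 32.
int main() {
    const int warp_size = 32;
    for (int row = 0; row < 4; ++row) {
        const int flat = row * (warp_size + 1); // x_ql[i * (WARP_SIZE + 1) + k] with k == 0
        printf("row %d starts at %3d (bank %2d of 32)\n", row, flat, flat % 32);
    }
    return 0;
}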
|
2196
|
+
|
2197
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
2198
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2199
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2200
|
+
|
2201
|
+
__builtin_assume(i >= 0);
|
2202
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2203
|
+
__builtin_assume(j >= 0);
|
2204
|
+
__builtin_assume(j < WARP_SIZE);
|
2205
|
+
__builtin_assume(k >= 0);
|
2206
|
+
__builtin_assume(k < WARP_SIZE);
|
2207
|
+
|
2208
|
+
const int kbx = k / QI2_K;
|
2209
|
+
const int kqsx = k % QI2_K;
|
2210
|
+
|
2211
|
+
const int bq8_offset = QR2_K * (kqsx / QI8_1);
|
2212
|
+
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2213
|
+
|
2214
|
+
const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
|
2215
|
+
|
2216
|
+
int u[QR2_K];
|
2217
|
+
float d8[QR2_K];
|
2218
|
+
|
2219
|
+
for (int l = 0; l < QR2_K; ++ l) {
|
2220
|
+
const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2221
|
+
u[l] = y_qs[y_qs_index];
|
2222
|
+
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2223
|
+
}
|
2224
|
+
|
2225
|
+
return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
|
2226
|
+
}
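In the tiled path above the q8_1 operand comes out of the shared y_qs/y_ds tiles rather than global memory: each output column j owns QR2_K*WARP_SIZE ints of y_qs, and the matching block scale sits at y_qs_index / QI8_1 in y_ds. A small host sketch of just that index arithmetic; the constants assume the usual QK_K == 256 build (QR2_K = 4, QI2_K = 16, QI8_1 = 8, WARP_SIZE = 32) and all names are illustrative:

#include <cstdio>

// Index arithmetic of vec_dot_q2_K_q8_1_mul_mat above, replayed on the host.
int main() {
    const int WARP = 32, QR2K = 4, QI2K = 16, QI8_1 = 8; // assumed QK_K == 256 values
    const int j = 1;   // output column within the tile
    const int k = 20;  // position within the warp-wide row
    const int kbx  = k / QI2K;
    const int kqsx = k % QI2K;
    const int bq8_offset = QR2K * (kqsx / QI8_1);
    for (int l = 0; l < QR2K; ++l) {
        const int y_qs_index = j * (QR2K*WARP) + kbx * (QR2K*QI2K)
                             + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
        printf("l=%d -> y_qs[%d], scale y_ds[%d]\n", l, y_qs_index, y_qs_index / QI8_1);
    }
    return 0;
}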
|
2227
|
+
|
2228
|
+
#define VDR_q3_K_q8_1 1
|
2229
|
+
|
2230
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
|
2231
|
+
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2232
|
+
const int & scale_offset, const float & d, const float * __restrict__ d8) {
|
2233
|
+
|
2234
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2235
|
+
float sumf = 0.0f;
|
2236
|
+
|
2237
|
+
for (int i = 0; i < QR3_K; ++i) {
|
2238
|
+
const int isc = scale_offset + 2*i;
|
2239
|
+
|
2240
|
+
const int isc_low = isc % (QK_K/32);
|
2241
|
+
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
2242
|
+
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
2243
|
+
|
2244
|
+
const int isc_high = isc % (QK_K/64);
|
2245
|
+
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
2246
|
+
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
2247
|
+
|
2248
|
+
const int sc = (sc_low | sc_high) - 32;
|
2249
|
+
|
2250
|
+
const int vil = (vl >> (2*i)) & 0x03030303;
|
2251
|
+
|
2252
|
+
const int vih = ((vh >> i) << 2) & 0x04040404;
|
2253
|
+
|
2254
|
+
const int vi = __vsubss4(vil, vih);
|
2255
|
+
|
2256
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2257
|
+
}
|
2258
|
+
|
2259
|
+
return d*sumf;
|
2260
|
+
#else
|
2261
|
+
return 0.0f; // only to satisfy the compiler
|
2262
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2263
|
+
}
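The scale decode above unpacks sixteen 6-bit q3_K scales from 12 bytes: the low 4 bits live in bytes 0..7, the high 2 bits in bytes 8..11, and the result is recentred by 32. Below is a host-side mirror of exactly that decode, assuming QK_K == 256 (so QK_K/32 == 8 and QK_K/64 == 4); q3_K_scale_ref is an illustrative name, not part of the sources:

#include <cstdint>
#include <cstdio>

// Host mirror of the q3_K scale decode above, assuming QK_K == 256.
static int q3_K_scale_ref(const uint8_t * scales, int isc) {
    const int isc_low       = isc % 8;            // QK_K/32 == 8
    const int sc_shift_low  = 4 * (isc / 8);
    const int sc_low        = (scales[isc_low] >> sc_shift_low) & 0xF;

    const int isc_high      = isc % 4;            // QK_K/64 == 4
    const int sc_shift_high = 2 * (isc / 4);
    const int sc_high       = ((scales[8 + isc_high] >> sc_shift_high) & 3) << 4;

    return (sc_low | sc_high) - 32;               // recentre to a signed scale
}

int main() {
    uint8_t scales[12] = {0};
    scales[0] = 0x21; scales[8] = 0x02;           // arbitrary example values
    printf("scale[0] = %d\n", q3_K_scale_ref(scales, 0));
    return 0;
}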
|
2264
|
+
|
2265
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
2266
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2267
|
+
|
2268
|
+
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
|
2269
|
+
|
2270
|
+
const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
|
2271
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2272
|
+
|
2273
|
+
const float d = bq3_K->d;
|
2274
|
+
|
2275
|
+
const int vl = get_int_from_uint8(bq3_K->qs, iqs);
|
2276
|
+
|
2277
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2278
|
+
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2279
|
+
|
2280
|
+
int u[QR3_K];
|
2281
|
+
float d8[QR3_K];
|
2282
|
+
|
2283
|
+
for (int i = 0; i < QR3_K; ++i) {
|
2284
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2285
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2286
|
+
}
|
2287
|
+
|
2288
|
+
return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2289
|
+
}
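About the `~` on the hmask load above: each hmask bit is 1 when a quant's high bit is set. Inverting the mask first means the later ((vh >> i) << 2) & 0x04040404 term becomes 4 exactly when the high bit was NOT set, so __vsubss4(vil, vih) subtracts 4 from the 2-bit low part only in that case, which is the signed q3_K value. A short host illustration of that identity (plain C++, arbitrary values):

#include <cstdio>

// One lane of the inverted-hmask trick above: low - (inverted_bit << 2)
// equals (low | high<<2) - 4 for every combination of the bits.
int main() {
    for (int high = 0; high <= 1; ++high) {
        for (int low = 0; low < 4; ++low) {
            const int inv        = high ^ 1;       // one lane of ~hmask
            const int sub        = inv << 2;       // 4 or 0, as the 0x04040404 mask yields
            const int via_kernel = low - sub;
            const int direct     = (low | (high << 2)) - 4;
            printf("high=%d low=%d -> kernel %2d, direct %2d\n",
                   high, low, via_kernel, direct);
        }
    }
    return 0;
}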
|
2290
|
+
|
2291
|
+
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2292
|
+
|
2293
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2294
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
|
2295
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
|
2296
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2297
|
+
|
2298
|
+
*x_ql = tile_x_ql;
|
2299
|
+
*x_dm = tile_x_dm;
|
2300
|
+
*x_qh = tile_x_qh;
|
2301
|
+
*x_sc = tile_x_sc;
|
2302
|
+
}
|
2303
|
+
|
2304
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2305
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2306
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2307
|
+
|
2308
|
+
__builtin_assume(i_offset >= 0);
|
2309
|
+
__builtin_assume(i_offset < 8);
|
2310
|
+
__builtin_assume(k >= 0);
|
2311
|
+
__builtin_assume(k < WARP_SIZE);
|
2312
|
+
|
2313
|
+
const int kbx = k / QI3_K;
|
2314
|
+
const int kqsx = k % QI3_K;
|
2315
|
+
|
2316
|
+
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2317
|
+
|
2318
|
+
#pragma unroll
|
2319
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2320
|
+
int i = i0 + i_offset;
|
2321
|
+
|
2322
|
+
if (need_check) {
|
2323
|
+
i = min(i, i_max);
|
2324
|
+
}
|
2325
|
+
|
2326
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2327
|
+
|
2328
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
2329
|
+
}
|
2330
|
+
|
2331
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2332
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2333
|
+
|
2334
|
+
#pragma unroll
|
2335
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
|
2336
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2337
|
+
|
2338
|
+
if (need_check) {
|
2339
|
+
i = min(i, i_max);
|
2340
|
+
}
|
2341
|
+
|
2342
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2343
|
+
|
2344
|
+
x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
|
2345
|
+
}
|
2346
|
+
|
2347
|
+
#pragma unroll
|
2348
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
|
2349
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2350
|
+
|
2351
|
+
if (need_check) {
|
2352
|
+
i = min(i, i_max);
|
2353
|
+
}
|
2354
|
+
|
2355
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2356
|
+
|
2357
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2358
|
+
}
|
2359
|
+
|
2360
|
+
#pragma unroll
|
2361
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2362
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2363
|
+
|
2364
|
+
if (need_check) {
|
2365
|
+
i = min(i, i_max);
|
2366
|
+
}
|
2367
|
+
|
2368
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2369
|
+
|
2370
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
|
2371
|
+
}
|
2372
|
+
}
|
2373
|
+
|
2374
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
2375
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2376
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2377
|
+
|
2378
|
+
__builtin_assume(i >= 0);
|
2379
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2380
|
+
__builtin_assume(j >= 0);
|
2381
|
+
__builtin_assume(j < WARP_SIZE);
|
2382
|
+
__builtin_assume(k >= 0);
|
2383
|
+
__builtin_assume(k < WARP_SIZE);
|
2384
|
+
|
2385
|
+
const int kbx = k / QI3_K;
|
2386
|
+
const int kqsx = k % QI3_K;
|
2387
|
+
|
2388
|
+
const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
|
2389
|
+
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2390
|
+
|
2391
|
+
const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
|
2392
|
+
|
2393
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2394
|
+
const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
|
2395
|
+
|
2396
|
+
int u[QR3_K];
|
2397
|
+
float d8[QR3_K];
|
2398
|
+
|
2399
|
+
for (int l = 0; l < QR3_K; ++ l) {
|
2400
|
+
const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2401
|
+
u[l] = y_qs[y_qs_index];
|
2402
|
+
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
|
2406
|
+
x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
|
2407
|
+
}
|
2408
|
+
|
2409
|
+
#define VDR_q4_K_q8_1 2
|
2410
|
+
|
2411
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
|
2412
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2413
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
2414
|
+
|
2415
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2416
|
+
float sumf_d = 0.0f;
|
2417
|
+
float sumf_m = 0.0f;
|
2418
|
+
|
2419
|
+
for (int i = 0; i < QR4_K; ++i) {
|
2420
|
+
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
2421
|
+
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
2422
|
+
|
2423
|
+
const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
|
2424
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
|
2425
|
+
|
2426
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
2427
|
+
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
2428
|
+
}
|
2429
|
+
|
2430
|
+
return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
|
2431
|
+
|
2432
|
+
#else
|
2433
|
+
return 0.0f; // only to satisfy the compiler
|
2434
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2435
|
+
}
|
2436
|
+
|
2437
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
2438
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2439
|
+
|
2440
|
+
#ifndef GGML_QKK_64
|
2441
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2442
|
+
|
2443
|
+
int v[2];
|
2444
|
+
int u[2*QR4_K];
|
2445
|
+
float d8[QR4_K];
|
2446
|
+
|
2447
|
+
// iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
|
2448
|
+
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
|
2449
|
+
|
2450
|
+
// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
|
2451
|
+
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
|
2452
|
+
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
2453
|
+
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
2454
|
+
|
2455
|
+
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2456
|
+
v[0] = q4[0];
|
2457
|
+
v[1] = q4[4];
|
2458
|
+
|
2459
|
+
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
|
2460
|
+
uint16_t aux[2];
|
2461
|
+
const int j = bq8_offset/2;
|
2462
|
+
if (j < 2) {
|
2463
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2464
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2465
|
+
} else {
|
2466
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2467
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2468
|
+
}
|
2469
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2470
|
+
const uint8_t * m = sc + 2;
|
2471
|
+
|
2472
|
+
for (int i = 0; i < QR4_K; ++i) {
|
2473
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2474
|
+
d8[i] = bq8i->ds.x;
|
2475
|
+
|
2476
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2477
|
+
u[2*i+0] = q8[0];
|
2478
|
+
u[2*i+1] = q8[4];
|
2479
|
+
}
|
2480
|
+
|
2481
|
+
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
|
2482
|
+
|
2483
|
+
#else
|
2484
|
+
|
2485
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2486
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2487
|
+
|
2488
|
+
float sumf_d = 0.0f;
|
2489
|
+
float sumf_m = 0.0f;
|
2490
|
+
|
2491
|
+
uint16_t aux16[2];
|
2492
|
+
const uint8_t * s = (const uint8_t *)aux16;
|
2493
|
+
|
2494
|
+
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
2495
|
+
aux16[0] = a[0] & 0x0f0f;
|
2496
|
+
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
2497
|
+
|
2498
|
+
const float dall = bq4_K->d[0];
|
2499
|
+
const float dmin = bq4_K->d[1];
|
2500
|
+
|
2501
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2502
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2503
|
+
|
2504
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2505
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2506
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2507
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2508
|
+
|
2509
|
+
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
2510
|
+
const int v1 = q4[0];
|
2511
|
+
const int v2 = q4[4];
|
2512
|
+
|
2513
|
+
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
2514
|
+
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
2515
|
+
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
2516
|
+
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
2517
|
+
|
2518
|
+
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
2519
|
+
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
2520
|
+
|
2521
|
+
return dall * sumf_d - dmin * sumf_m;
|
2522
|
+
|
2523
|
+
#else
|
2524
|
+
return 0.0f; // only to satisfy the compiler
|
2525
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2526
|
+
|
2527
|
+
#endif
|
2528
|
+
}
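The aux[] / 0x3f3f masking above (reused for q5_K further down) extracts, for a given bq8_offset, two 6-bit scales into sc[0..1] and the two matching 6-bit mins into m[0..1] from the packed 12-byte scale field. A host-side mirror of just that unpack, assuming the QK_K == 256 layout and a little-endian host, as the device code does; unpack_q4_K_scales_ref is an illustrative name:

#include <cstdint>
#include <cstdio>

// Mirrors the scale/min unpack used in vec_dot_q4_K_q8_1 above.
static void unpack_q4_K_scales_ref(const uint16_t * scales, int bq8_offset,
                                   uint8_t sc_out[2], uint8_t m_out[2]) {
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux; // two 6-bit scales
    const uint8_t * m  = sc + 2;               // two 6-bit mins
    sc_out[0] = sc[0]; sc_out[1] = sc[1];
    m_out[0]  = m[0];  m_out[1]  = m[1];
}

int main() {
    const uint16_t scales[6] = {0x2a15, 0x3f01, 0x1008, 0x0000, 0x0000, 0x0000}; // example bytes
    uint8_t sc[2], m[2];
    unpack_q4_K_scales_ref(scales, 0, sc, m);
    printf("sc = %d %d, m = %d %d\n", sc[0], sc[1], m[0], m[1]);
    return 0;
}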
|
2529
|
+
|
2530
|
+
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2531
|
+
|
2532
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2533
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
|
2534
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2535
|
+
|
2536
|
+
*x_ql = tile_x_ql;
|
2537
|
+
*x_dm = tile_x_dm;
|
2538
|
+
*x_sc = tile_x_sc;
|
2539
|
+
}
|
2540
|
+
|
2541
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2542
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2543
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2544
|
+
|
2545
|
+
__builtin_assume(i_offset >= 0);
|
2546
|
+
__builtin_assume(i_offset < 8);
|
2547
|
+
__builtin_assume(k >= 0);
|
2548
|
+
__builtin_assume(k < WARP_SIZE);
|
2549
|
+
|
2550
|
+
const int kbx = k / QI4_K; // == 0 if QK_K == 256
|
2551
|
+
const int kqsx = k % QI4_K; // == k if QK_K == 256
|
2552
|
+
|
2553
|
+
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2554
|
+
|
2555
|
+
#pragma unroll
|
2556
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2557
|
+
int i = i0 + i_offset;
|
2558
|
+
|
2559
|
+
if (need_check) {
|
2560
|
+
i = min(i, i_max);
|
2561
|
+
}
|
2562
|
+
|
2563
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2564
|
+
|
2565
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2566
|
+
}
|
2567
|
+
|
2568
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2569
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2570
|
+
|
2571
|
+
#pragma unroll
|
2572
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
|
2573
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2574
|
+
|
2575
|
+
if (need_check) {
|
2576
|
+
i = min(i, i_max);
|
2577
|
+
}
|
2578
|
+
|
2579
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2580
|
+
|
2581
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2582
|
+
}
|
2583
|
+
|
2584
|
+
#pragma unroll
|
2585
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2586
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2587
|
+
|
2588
|
+
if (need_check) {
|
2589
|
+
i = min(i, i_max);
|
2590
|
+
}
|
2591
|
+
|
2592
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2593
|
+
|
2594
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
|
2595
|
+
}
|
2596
|
+
}
|
2597
|
+
|
2598
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2599
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2600
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2601
|
+
|
2602
|
+
__builtin_assume(i >= 0);
|
2603
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2604
|
+
__builtin_assume(j >= 0);
|
2605
|
+
__builtin_assume(j < WARP_SIZE);
|
2606
|
+
__builtin_assume(k >= 0);
|
2607
|
+
__builtin_assume(k < WARP_SIZE);
|
2608
|
+
|
2609
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2610
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2611
|
+
|
2612
|
+
int v[2];
|
2613
|
+
int u[2*QR4_K];
|
2614
|
+
float d8[QR4_K];
|
2615
|
+
|
2616
|
+
// kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
|
2617
|
+
const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
|
2618
|
+
|
2619
|
+
v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2620
|
+
v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2621
|
+
|
2622
|
+
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2623
|
+
uint16_t aux[2];
|
2624
|
+
const int l = bq8_offset/2;
|
2625
|
+
if (l < 2) {
|
2626
|
+
aux[0] = scales[l+0] & 0x3f3f;
|
2627
|
+
aux[1] = scales[l+2] & 0x3f3f;
|
2628
|
+
} else {
|
2629
|
+
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2630
|
+
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2631
|
+
}
|
2632
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2633
|
+
const uint8_t * m = sc + 2;
|
2634
|
+
|
2635
|
+
for (int l = 0; l < QR4_K; ++l) {
|
2636
|
+
const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2637
|
+
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2638
|
+
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2639
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
2640
|
+
}
|
2641
|
+
|
2642
|
+
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
|
2643
|
+
}
|
2644
|
+
|
2645
|
+
#define VDR_q5_K_q8_1 2
|
2646
|
+
|
2647
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
|
2648
|
+
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2649
|
+
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
2650
|
+
|
2651
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2652
|
+
float sumf_d = 0.0f;
|
2653
|
+
float sumf_m = 0.0f;
|
2654
|
+
|
2655
|
+
for (int i = 0; i < QR5_K; ++i) {
|
2656
|
+
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
2657
|
+
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
2658
|
+
|
2659
|
+
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
2660
|
+
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
2661
|
+
|
2662
|
+
const int v0i = vl0i | vh0i;
|
2663
|
+
const int v1i = vl1i | vh1i;
|
2664
|
+
|
2665
|
+
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
2666
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
2667
|
+
|
2668
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
2669
|
+
sumf_m += d8[i] * (dot2 * m[i]);
|
2670
|
+
|
2671
|
+
}
|
2672
|
+
|
2673
|
+
return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
|
2674
|
+
|
2675
|
+
#else
|
2676
|
+
return 0.0f; // only to satisfy the compiler
|
2677
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2678
|
+
}
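Per lane, a q5_K quant is rebuilt above from a 4-bit low nibble in vl and one high bit pulled out of vh and shifted into the 0x10 position. A minimal host illustration of that lane-wise reconstruction; the packed values are arbitrary examples:

#include <cstdio>

// Lane-wise reconstruction of 5-bit q5_K quants, mirroring the masks above.
int main() {
    const int vl = 0x0F030800; // four low nibbles per shift
    const int vh = 0x01010001; // bit i of each lane supplies the fifth bit
    for (int i = 0; i < 2; ++i) { // two iterations, as the QR5_K loop above
        const int vli = (vl >> (4*i)) & 0x0F0F0F0F;
        const int vhi = ((vh >> i) << 4) & 0x10101010;
        const int vi  = vli | vhi;  // 5-bit values, one per byte lane
        printf("i=%d -> 0x%08X\n", i, (unsigned)vi);
    }
    return 0;
}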
|
2679
|
+
|
2680
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
2681
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2682
|
+
|
2683
|
+
#ifndef GGML_QKK_64
|
2684
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2685
|
+
|
2686
|
+
int vl[2];
|
2687
|
+
int vh[2];
|
2688
|
+
int u[2*QR5_K];
|
2689
|
+
float d8[QR5_K];
|
2690
|
+
|
2691
|
+
const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
|
2692
|
+
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2693
|
+
const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
|
2694
|
+
|
2695
|
+
vl[0] = ql[0];
|
2696
|
+
vl[1] = ql[4];
|
2697
|
+
|
2698
|
+
vh[0] = qh[0] >> bq8_offset;
|
2699
|
+
vh[1] = qh[4] >> bq8_offset;
|
2700
|
+
|
2701
|
+
const uint16_t * scales = (const uint16_t *)bq5_K->scales;
|
2702
|
+
uint16_t aux[2];
|
2703
|
+
const int j = bq8_offset/2;
|
2704
|
+
if (j < 2) {
|
2705
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2706
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2707
|
+
} else {
|
2708
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2709
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2710
|
+
}
|
2711
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2712
|
+
const uint8_t * m = sc + 2;
|
2713
|
+
|
2714
|
+
for (int i = 0; i < QR5_K; ++i) {
|
2715
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2716
|
+
d8[i] = bq8i->ds.x;
|
2717
|
+
|
2718
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2719
|
+
u[2*i+0] = q8[0];
|
2720
|
+
u[2*i+1] = q8[4];
|
2721
|
+
}
|
2722
|
+
|
2723
|
+
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
|
2724
|
+
|
2725
|
+
#else
|
2726
|
+
|
2727
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2728
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2729
|
+
|
2730
|
+
const int8_t * s = bq5_K->scales;
|
2731
|
+
|
2732
|
+
const float d = bq5_K->d;
|
2733
|
+
|
2734
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2735
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2736
|
+
|
2737
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2738
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2739
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2740
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2741
|
+
|
2742
|
+
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
2743
|
+
const int vl1 = ql[0];
|
2744
|
+
const int vl2 = ql[4];
|
2745
|
+
|
2746
|
+
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
2747
|
+
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
2748
|
+
const int in = step%8; // 0, 4, 0, 4
|
2749
|
+
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
2750
|
+
|
2751
|
+
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
2752
|
+
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
2753
|
+
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
2754
|
+
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
2755
|
+
|
2756
|
+
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
2757
|
+
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
2758
|
+
|
2759
|
+
return d * sumf_d;
|
2760
|
+
|
2761
|
+
#else
|
2762
|
+
return 0.0f; // only to satisfy the compiler
|
2763
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2764
|
+
|
2765
|
+
#endif
|
2766
|
+
}
|
2767
|
+
|
2768
|
+
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2769
|
+
|
2770
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2771
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
|
2772
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2773
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2774
|
+
|
2775
|
+
*x_ql = tile_x_ql;
|
2776
|
+
*x_dm = tile_x_dm;
|
2777
|
+
*x_qh = tile_x_qh;
|
2778
|
+
*x_sc = tile_x_sc;
|
2779
|
+
}
|
2780
|
+
|
2781
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2782
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2783
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2784
|
+
|
2785
|
+
__builtin_assume(i_offset >= 0);
|
2786
|
+
__builtin_assume(i_offset < 8);
|
2787
|
+
__builtin_assume(k >= 0);
|
2788
|
+
__builtin_assume(k < WARP_SIZE);
|
2789
|
+
|
2790
|
+
const int kbx = k / QI5_K; // == 0 if QK_K == 256
|
2791
|
+
const int kqsx = k % QI5_K; // == k if QK_K == 256
|
2792
|
+
|
2793
|
+
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2794
|
+
|
2795
|
+
#pragma unroll
|
2796
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2797
|
+
int i = i0 + i_offset;
|
2798
|
+
|
2799
|
+
if (need_check) {
|
2800
|
+
i = min(i, i_max);
|
2801
|
+
}
|
2802
|
+
|
2803
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2804
|
+
|
2805
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2806
|
+
}
|
2807
|
+
|
2808
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2809
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2810
|
+
|
2811
|
+
#pragma unroll
|
2812
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
|
2813
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2814
|
+
|
2815
|
+
if (need_check) {
|
2816
|
+
i = min(i, i_max);
|
2817
|
+
}
|
2818
|
+
|
2819
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2820
|
+
|
2821
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
2822
|
+
}
|
2823
|
+
|
2824
|
+
#pragma unroll
|
2825
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2826
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2827
|
+
|
2828
|
+
if (need_check) {
|
2829
|
+
i = min(i, i_max);
|
2830
|
+
}
|
2831
|
+
|
2832
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
|
2833
|
+
|
2834
|
+
x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
|
2835
|
+
}
|
2836
|
+
|
2837
|
+
#pragma unroll
|
2838
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2839
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2840
|
+
|
2841
|
+
if (need_check) {
|
2842
|
+
i = min(i, i_max);
|
2843
|
+
}
|
2844
|
+
|
2845
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2846
|
+
|
2847
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
|
2848
|
+
}
|
2849
|
+
}
|
2850
|
+
|
2851
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
2852
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2853
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2854
|
+
|
2855
|
+
__builtin_assume(i >= 0);
|
2856
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2857
|
+
__builtin_assume(j >= 0);
|
2858
|
+
__builtin_assume(j < WARP_SIZE);
|
2859
|
+
__builtin_assume(k >= 0);
|
2860
|
+
__builtin_assume(k < WARP_SIZE);
|
2861
|
+
|
2862
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2863
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2864
|
+
|
2865
|
+
int vl[2];
|
2866
|
+
int vh[2];
|
2867
|
+
int u[2*QR4_K];
|
2868
|
+
float d8[QR4_K];
|
2869
|
+
|
2870
|
+
const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
|
2871
|
+
|
2872
|
+
vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2873
|
+
vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2874
|
+
|
2875
|
+
vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
|
2876
|
+
vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
|
2877
|
+
|
2878
|
+
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2879
|
+
uint16_t aux[2];
|
2880
|
+
const int l = bq8_offset/2;
|
2881
|
+
if (l < 2) {
|
2882
|
+
aux[0] = scales[l+0] & 0x3f3f;
|
2883
|
+
aux[1] = scales[l+2] & 0x3f3f;
|
2884
|
+
} else {
|
2885
|
+
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2886
|
+
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2887
|
+
}
|
2888
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2889
|
+
const uint8_t * m = sc + 2;
|
2890
|
+
|
2891
|
+
for (int l = 0; l < QR5_K; ++l) {
|
2892
|
+
const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2893
|
+
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2894
|
+
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2895
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
2896
|
+
}
|
2897
|
+
|
2898
|
+
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
|
2899
|
+
}
|
2900
|
+
|
2901
|
+
#define VDR_q6_K_q8_1 1
|
2902
|
+
|
2903
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
|
2904
|
+
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
2905
|
+
const float & d, const float * __restrict__ d8) {
|
2906
|
+
|
2907
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2908
|
+
float sumf = 0.0f;
|
2909
|
+
|
2910
|
+
for (int i = 0; i < QR6_K; ++i) {
|
2911
|
+
const int sc = scales[4*i];
|
2912
|
+
|
2913
|
+
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
2914
|
+
|
2915
|
+
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
2916
|
+
|
2917
|
+
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
2918
|
+
|
2919
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2920
|
+
}
|
2921
|
+
|
2922
|
+
return d*sumf;
|
2923
|
+
#else
|
2924
|
+
return 0.0f; // only to satisfy the compiler
|
2925
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2926
|
+
}
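The q6_K path combines a 4-bit low part and two high bits into a 6-bit value per lane, then subtracts 32 in every lane at once with __vsubss4 (per-byte signed subtraction with saturation). A host reference of that step; vsubss4_ref and the sample values are illustrative:

#include <cstdint>
#include <cstdio>

// Mirrors __vsubss4(a, b): per-byte signed subtraction with saturation.
static int vsubss4_ref(int a, int b) {
    int r = 0;
    for (int lane = 0; lane < 4; ++lane) {
        int d = (int)(int8_t)(a >> (8*lane)) - (int)(int8_t)(b >> (8*lane));
        if (d >  127) d =  127;
        if (d < -128) d = -128;
        r |= (d & 0xFF) << (8*lane);
    }
    return r;
}

int main() {
    const int vil = 0x0F000A05;                          // low 4 bits per lane
    const int vih = 0x30200010;                          // high 2 bits, already shifted into 0x30
    const int vi  = vsubss4_ref(vil | vih, 0x20202020);  // (vil | vih) - 32 per lane
    printf("vi = 0x%08X\n", (unsigned)vi);
    return 0;
}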
|
2927
|
+
|
2928
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
2929
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2930
|
+
|
2931
|
+
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
2932
|
+
|
2933
|
+
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
|
2934
|
+
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
2935
|
+
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
1601
2936
|
|
1602
|
-
|
1603
|
-
|
1604
|
-
const int * q8 = (const int *)bq8i->qs + (iqs%4);
|
1605
|
-
const int ui1 = q8[0];
|
1606
|
-
const int ui2 = q8[4];
|
2937
|
+
const int vl = get_int_from_uint8(bq6_K->ql, iqs);
|
2938
|
+
const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
|
1607
2939
|
|
1608
|
-
|
1609
|
-
const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
|
2940
|
+
const int8_t * scales = bq6_K->scales + scale_offset;
|
1610
2941
|
|
1611
|
-
|
1612
|
-
|
2942
|
+
int u[QR6_K];
|
2943
|
+
float d8[QR6_K];
|
1613
2944
|
|
1614
|
-
|
1615
|
-
|
2945
|
+
for (int i = 0; i < QR6_K; ++i) {
|
2946
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
2947
|
+
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
1616
2948
|
}
|
1617
2949
|
|
1618
|
-
return d
|
2950
|
+
return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
|
2951
|
+
}
|
1619
2952
|
|
1620
|
-
|
2953
|
+
static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1621
2954
|
|
1622
|
-
|
1623
|
-
|
2955
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2956
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
|
2957
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
|
2958
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
1624
2959
|
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
2960
|
+
*x_ql = tile_x_ql;
|
2961
|
+
*x_dm = tile_x_dm;
|
2962
|
+
*x_qh = tile_x_qh;
|
2963
|
+
*x_sc = tile_x_sc;
|
2964
|
+
}
|
1628
2965
|
|
1629
|
-
|
1630
|
-
const
|
2966
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
2967
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2968
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1631
2969
|
|
1632
|
-
|
1633
|
-
|
2970
|
+
__builtin_assume(i_offset >= 0);
|
2971
|
+
__builtin_assume(i_offset < 8);
|
2972
|
+
__builtin_assume(k >= 0);
|
2973
|
+
__builtin_assume(k < WARP_SIZE);
|
1634
2974
|
|
1635
|
-
const int
|
1636
|
-
const int
|
1637
|
-
const int ui3 = *((const int *)bq8_1[1].qs + iqs);
|
1638
|
-
const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
|
2975
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2976
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
1639
2977
|
|
1640
|
-
const
|
1641
|
-
const int v1 = q4[0];
|
1642
|
-
const int v2 = q4[4];
|
2978
|
+
const block_q6_K * bx0 = (block_q6_K *) vx;
|
1643
2979
|
|
1644
|
-
|
1645
|
-
|
1646
|
-
|
1647
|
-
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
2980
|
+
#pragma unroll
|
2981
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2982
|
+
int i = i0 + i_offset;
|
1648
2983
|
|
1649
|
-
|
1650
|
-
|
2984
|
+
if (need_check) {
|
2985
|
+
i = min(i, i_max);
|
2986
|
+
}
|
1651
2987
|
|
1652
|
-
|
2988
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
1653
2989
|
|
1654
|
-
|
2990
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
|
2991
|
+
}
|
1655
2992
|
|
1656
|
-
|
1657
|
-
|
1658
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1659
|
-
}
|
2993
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
1660
2995
|
|
1661
|
-
|
1662
|
-
|
2996
|
+
#pragma unroll
|
2997
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
1663
2999
|
|
1664
|
-
|
1665
|
-
|
3000
|
+
if (need_check) {
|
3001
|
+
i = min(i, i_max);
|
3002
|
+
}
|
1666
3003
|
|
1667
|
-
|
3004
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
1668
3005
|
|
1669
|
-
|
1670
|
-
|
1671
|
-
const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
|
3006
|
+
x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
|
3007
|
+
}
|
1672
3008
|
|
1673
|
-
|
1674
|
-
|
3009
|
+
#pragma unroll
|
3010
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
|
3011
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
1675
3012
|
|
1676
|
-
|
1677
|
-
|
3013
|
+
if (need_check) {
|
3014
|
+
i = min(i, i_max);
|
3015
|
+
}
|
1678
3016
|
|
1679
|
-
|
1680
|
-
const int vl2 = ql[4];
|
3017
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
|
1681
3018
|
|
1682
|
-
|
1683
|
-
|
3019
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
|
3020
|
+
}
|
1684
3021
|
|
1685
|
-
|
1686
|
-
|
1687
|
-
|
1688
|
-
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
3022
|
+
#pragma unroll
|
3023
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
3024
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
3025
|
+
|
3026
|
+
if (need_check) {
|
3027
|
+
i = min(i, i_max);
|
3028
|
+
}
|
3029
|
+
|
3030
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
|
3031
|
+
|
3032
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
|
1694
3033
|
}
|
1695
|
-
|
1696
|
-
const uint8_t * m = sc + 2;
|
3034
|
+
}
|
1697
3035
|
|
1698
|
-
|
3036
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
3037
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3038
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1699
3039
|
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1703
|
-
|
1704
|
-
|
3040
|
+
__builtin_assume(i >= 0);
|
3041
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
3042
|
+
__builtin_assume(j >= 0);
|
3043
|
+
__builtin_assume(j < WARP_SIZE);
|
3044
|
+
__builtin_assume(k >= 0);
|
3045
|
+
__builtin_assume(k < WARP_SIZE);
|
1705
3046
|
|
1706
|
-
|
1707
|
-
|
3047
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3048
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
1708
3049
|
|
1709
|
-
|
1710
|
-
|
3050
|
+
const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
|
3051
|
+
const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
|
3052
|
+
const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
|
1711
3053
|
|
1712
|
-
|
1713
|
-
const int vi2 = vil2 | vih2;
|
3054
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
|
1714
3055
|
|
1715
|
-
|
1716
|
-
|
3056
|
+
const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
|
3057
|
+
const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
|
1717
3058
|
|
1718
|
-
|
1719
|
-
|
3059
|
+
int u[QR6_K];
|
3060
|
+
float d8[QR6_K];
|
1720
3061
|
|
3062
|
+
for (int l = 0; l < QR6_K; ++l) {
|
3063
|
+
const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
|
3064
|
+
u[l] = y_qs[kqsy];
|
3065
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
1721
3066
|
}
|
1722
3067
|
|
1723
|
-
return
|
1724
|
-
|
1725
|
-
|
3068
|
+
return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
|
3069
|
+
x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
|
3070
|
+
}
|
1726
3071
|
|
1727
|
-
|
3072
|
+
template <int qk, int qr, int qi, typename block_q_t,
|
3073
|
+
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3074
|
+
static __global__ void mul_mat_q(
|
3075
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3076
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
1728
3077
|
|
1729
|
-
const
|
3078
|
+
const block_q_t * x = (const block_q_t *) vx;
|
3079
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
1730
3080
|
|
1731
|
-
const
|
1732
|
-
const
|
3081
|
+
const int blocks_per_row_x = ncols_x / qk;
|
3082
|
+
const int blocks_per_col_y = nrows_y / QK8_1;
|
3083
|
+
const int blocks_per_warp = WARP_SIZE / qi;
|
1733
3084
|
|
1734
|
-
const int
|
1735
|
-
const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
|
1736
|
-
const int ui3 = *((const int *)bq8_1[1].qs + iqs);
|
1737
|
-
const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
|
3085
|
+
const int & ncols_dst = ncols_y;
|
1738
3086
|
|
1739
|
-
const int
|
1740
|
-
const int
|
1741
|
-
const int vl2 = ql[4];
|
3087
|
+
const int tid_x = threadIdx.x;
|
3088
|
+
const int tid_y = threadIdx.y;
|
1742
3089
|
|
1743
|
-
const int
|
1744
|
-
const int
|
1745
|
-
const int
|
1746
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
3090
|
+
const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
|
3091
|
+
const int & row_x_0 = row_dst_0;
|
3092
|
+
const int row_dst = row_dst_0 + tid_x;
|
1747
3093
|
|
1748
|
-
const int
|
1749
|
-
const int
|
1750
|
-
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
1751
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
3094
|
+
const int col_dst_0 = blockIdx.y*WARP_SIZE;
|
3095
|
+
const int & col_y_0 = col_dst_0;
|
1752
3096
|
|
1753
|
-
|
1754
|
-
|
3097
|
+
int * tile_x_ql = nullptr;
|
3098
|
+
half2 * tile_x_dm = nullptr;
|
3099
|
+
int * tile_x_qh = nullptr;
|
3100
|
+
int * tile_x_sc = nullptr;
|
1755
3101
|
|
1756
|
-
|
3102
|
+
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
1757
3103
|
|
1758
|
-
|
3104
|
+
const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
|
1759
3105
|
|
1760
|
-
|
1761
|
-
|
1762
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1763
|
-
}
|
3106
|
+
__shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
|
3107
|
+
__shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
|
1764
3108
|
|
1765
|
-
|
1766
|
-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
3109
|
+
float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
|
1767
3110
|
|
1768
|
-
|
1769
|
-
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
3111
|
+
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
1770
3112
|
|
1771
|
-
|
1772
|
-
|
1773
|
-
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
3113
|
+
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3114
|
+
tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
|
1774
3115
|
|
1775
|
-
|
3116
|
+
for (int ir = 0; ir < qr; ++ir) {
|
3117
|
+
const int kqs = ir*WARP_SIZE + tid_x;
|
3118
|
+
const int kbxd = kqs / QI8_1;
|
1776
3119
|
|
1777
|
-
|
3120
|
+
for (int i = 0; i < WARP_SIZE; i += 8) {
|
3121
|
+
const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
1778
3122
|
|
1779
|
-
|
1780
|
-
memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
|
3123
|
+
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
1781
3124
|
|
1782
|
-
|
1783
|
-
|
3125
|
+
tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
|
3126
|
+
}
|
3127
|
+
}
|
1784
3128
|
|
1785
|
-
|
1786
|
-
|
3129
|
+
for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
|
3130
|
+
const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
|
3131
|
+
const int kby = tid_x % blocks_per_tile_y_col;
|
3132
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3133
|
+
tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
|
3134
|
+
}
|
1787
3135
|
|
1788
|
-
|
1789
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
|
1790
|
-
const float d8i = bq8i->d;
|
3136
|
+
__syncthreads();
|
1791
3137
|
|
1792
|
-
|
3138
|
+
#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
|
3139
|
+
#pragma unroll
|
3140
|
+
#endif // __CUDA_ARCH__ >= 700
|
3141
|
+
for (int k = 0; k < WARP_SIZE; k += vdr) {
|
3142
|
+
#pragma unroll
|
3143
|
+
for (int j = 0; j < WARP_SIZE; j += 8) {
|
3144
|
+
#pragma unroll
|
3145
|
+
for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
|
3146
|
+
sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3147
|
+
tid_x + i, tid_y + j, k);
|
3148
|
+
}
|
3149
|
+
}
|
3150
|
+
}
|
1793
3151
|
|
1794
|
-
|
3152
|
+
__syncthreads();
|
3153
|
+
}
|
1795
3154
|
|
1796
|
-
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
1797
3155
|
|
1798
|
-
|
3156
|
+
if (row_dst >= nrows_dst) {
|
3157
|
+
return;
|
1799
3158
|
}
|
1800
3159
|
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
3160
|
+
for (int j = 0; j < WARP_SIZE; j += 8) {
|
3161
|
+
const int col_dst = col_dst_0 + j + tid_y;
|
3162
|
+
|
3163
|
+
if (col_dst >= ncols_dst) {
|
3164
|
+
return;
|
3165
|
+
}
|
3166
|
+
|
3167
|
+
for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
|
3168
|
+
dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
|
3169
|
+
}
|
3170
|
+
}
|
1805
3171
|
}
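The mul_mat_q kernel above tiles the output into GGML_CUDA_MMQ_Y rows by WARP_SIZE columns per block, with block dimensions (WARP_SIZE, WARP_SIZE/4) as used by the launchers further down in this diff. A hedged host-side sketch of how a launcher sizes the grid for given matrix dimensions; GGML_CUDA_MMQ_Y is assumed to be 64 here purely for illustration:

#include <cstdio>

// Grid sizing for the tiled mul_mat_q kernel: one block covers
// GGML_CUDA_MMQ_Y output rows and WARP_SIZE output columns.
int main() {
    const int warp_size = 32;
    const int mmq_y     = 64;     // assumed value of GGML_CUDA_MMQ_Y
    const int nrows_x   = 4096;   // rows of the quantized matrix == rows of dst
    const int ncols_y   = 512;    // columns of the activations == columns of dst
    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + warp_size - 1) / warp_size;
    printf("grid = (%d, %d), block = (%d, %d)\n",
           block_num_x, block_num_y, warp_size, warp_size/4);
    return 0;
}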
|
1806
3172
|
|
1807
|
-
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3173
|
+
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
1808
3174
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
1809
3175
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
1810
3176
|
|
@@ -1813,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1813
3179
|
}
|
1814
3180
|
|
1815
3181
|
const int blocks_per_row = ncols / qk;
|
1816
|
-
const int blocks_per_warp = WARP_SIZE / qi;
|
3182
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
1817
3183
|
|
1818
3184
|
// partial sum for each thread
|
1819
3185
|
float tmp = 0.0f;
|
@@ -1822,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1822
3188
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
1823
3189
|
|
1824
3190
|
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
1825
|
-
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
|
3191
|
+
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
1826
3192
|
|
1827
|
-
const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
|
3193
|
+
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
|
1828
3194
|
|
1829
|
-
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
|
3195
|
+
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
1830
3196
|
|
1831
3197
|
tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
|
1832
3198
|
}
|
@@ -1859,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1859
3225
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
1860
3226
|
|
1861
3227
|
// partial sum for each thread
|
1862
|
-
#ifdef
|
3228
|
+
#ifdef GGML_CUDA_F16
|
1863
3229
|
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
1864
3230
|
#else
|
1865
3231
|
float tmp = 0.0f;
|
1866
|
-
#endif //
|
3232
|
+
#endif // GGML_CUDA_F16
|
1867
3233
|
|
1868
3234
|
for (int i = 0; i < ncols; i += iter_stride) {
|
1869
3235
|
const int col = i + vals_per_iter*tid;
|
@@ -1883,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1883
3249
|
|
1884
3250
|
// matrix multiplication
|
1885
3251
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
1886
|
-
#ifdef
|
3252
|
+
#ifdef GGML_CUDA_F16
|
1887
3253
|
tmp += __hmul2(v, {
|
1888
3254
|
y[iybs + iqs + j/qr + 0],
|
1889
3255
|
y[iybs + iqs + j/qr + y_offset]
|
@@ -1891,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1891
3257
|
#else
|
1892
3258
|
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
1893
3259
|
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
1894
|
-
#endif //
|
3260
|
+
#endif // GGML_CUDA_F16
|
1895
3261
|
}
|
1896
3262
|
}
|
1897
3263
|
|
@@ -1902,11 +3268,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1902
3268
|
}
|
1903
3269
|
|
1904
3270
|
if (tid == 0) {
|
1905
|
-
#ifdef
|
3271
|
+
#ifdef GGML_CUDA_F16
|
1906
3272
|
dst[row] = tmp.x + tmp.y;
|
1907
3273
|
#else
|
1908
3274
|
dst[row] = tmp;
|
1909
|
-
#endif //
|
3275
|
+
#endif // GGML_CUDA_F16
|
1910
3276
|
}
|
1911
3277
|
}
|
1912
3278
|
|
@@ -2046,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
2046
3412
|
}
|
2047
3413
|
|
2048
3414
|
// rope == RoPE == rotary positional embedding
|
2049
|
-
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float
|
3415
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3416
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
2050
3417
|
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
2051
3418
|
|
2052
3419
|
if (col >= ncols) {
|
@@ -2056,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
2056
3423
|
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
2057
3424
|
const int i = row*ncols + col;
|
2058
3425
|
|
2059
|
-
const float theta =
|
3426
|
+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
2060
3427
|
const float sin_theta = sinf(theta);
|
2061
3428
|
const float cos_theta = cosf(theta);
|
2062
3429
|
|
@@ -2203,9 +3570,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
|
|
2203
3570
|
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
2204
3571
|
}
|
2205
3572
|
|
2206
|
-
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int
|
2207
|
-
const int
|
2208
|
-
|
3573
|
+
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
|
3574
|
+
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
3575
|
+
const dim3 num_blocks(block_num_x, ky, 1);
|
3576
|
+
const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
|
3577
|
+
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
2209
3578
|
}
|
2210
3579
|
|
2211
3580
|
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
@@ -2366,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2366
3735
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2367
3736
|
const dim3 block_nums(1, block_num_y, 1);
|
2368
3737
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2369
|
-
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
|
3738
|
+
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
2370
3739
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2371
3740
|
}
|
2372
3741
|
|
@@ -2375,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2375
3744
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2376
3745
|
const dim3 block_nums(1, block_num_y, 1);
|
2377
3746
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2378
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
|
3747
|
+
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
2379
3748
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2380
3749
|
}
|
2381
3750
|
|
@@ -2384,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2384
3753
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2385
3754
|
const dim3 block_nums(1, block_num_y, 1);
|
2386
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2393,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2402,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2411,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2420,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2429,10 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
-    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
-    // is better amortized.
-    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2441,10 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
-    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
-    // is better amortized.
-    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2453,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
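The hunks above all make the same mechanical change: mul_mat_vec_q gains an additional compile-time constant (the per-type VDR_* value) between the block type and the dot-product function, and the old q4_K/q5_K special case of halving QI*_K is dropped. The following is only an illustrative sketch of why such a non-type template parameter is useful: it appears to fix, at compile time, how many quantized values one dot-product call covers, so the inner loop can be fully unrolled. The names dot_block_sketch and vdr are hypothetical and not part of ggml.

    #include <cstdio>

    // Illustrative only: a non-type template parameter, like the VDR_* constants
    // added to mul_mat_vec_q above, fixes how many values one dot-product call
    // covers, so the compiler can fully unroll the inner loop.
    template <int vdr>
    static float dot_block_sketch(const float * x, const float * y) {
        float sum = 0.0f;
        for (int i = 0; i < vdr; ++i) { // vdr values per call, known at compile time
            sum += x[i] * y[i];
        }
        return sum;
    }

    int main() {
        const float x[4] = {1.f, 2.f, 3.f, 4.f};
        const float y[4] = {1.f, 1.f, 1.f, 1.f};
        printf("%f %f\n", dot_block_sketch<2>(x, y), dot_block_sketch<4>(x, y)); // 3 and 10
        return 0;
    }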
@@ -2500,6 +3863,186 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
+static void ggml_mul_mat_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q8_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q2_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q3_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
 static void ggml_mul_mat_p021_f16_f32_cuda(
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
     const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
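Each of the new ggml_mul_mat_qX_q8_1_cuda launchers above derives its grid from the same round-up divisions: nrows_x is tiled in chunks of GGML_CUDA_MMQ_Y, ncols_y in chunks of WARP_SIZE, and the load_tiles_*<true> instantiation is chosen whenever the row count is not an exact multiple. A small standalone sketch of that grid arithmetic; the constants 64 and 32 are example values only, not taken from this diff.

    #include <cstdio>

    // Example stand-ins; the real GGML_CUDA_MMQ_Y and WARP_SIZE come from ggml-cuda.cu.
    static const int MMQ_Y_EXAMPLE     = 64;
    static const int WARP_SIZE_EXAMPLE = 32;

    static int div_round_up(int n, int d) { return (n + d - 1) / d; }

    int main() {
        const int nrows_x = 4096; // rows of the quantized weight matrix
        const int ncols_y = 7;    // columns of the q8_1 activation matrix (batch size)
        printf("grid = (%d, %d, 1), needs bounds-checked tiles: %s\n",
               div_round_up(nrows_x, MMQ_Y_EXAMPLE),      // block_num_x = 64
               div_round_up(ncols_y, WARP_SIZE_EXAMPLE),  // block_num_y = 1
               nrows_x % MMQ_Y_EXAMPLE == 0 ? "no" : "yes");
        return 0;
    }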
@@ -2544,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2676,10 +4220,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -2701,9 +4244,7 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
-#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -2965,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddq_i != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t i01_diff = i01_high - i01_low;
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
+        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
+    size_t as;
+    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    ggml_cuda_pool_free(src1_q8_1, as);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
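ggml_cuda_op_mul_mat_q above quantizes the src1 slice to q8_1 on the fly: the row length ne10 is padded up to a multiple of MATRIX_ROW_PADDING, a temporary buffer of padded_row_size*ne11*sizeof(block_q8_1)/QK8_1 bytes is taken from the pool, and the per-type launcher is selected by src0->type. The padding expression is a plain round-up; a minimal sketch of it, using 512 purely as an example value for MATRIX_ROW_PADDING:

    #include <cstdint>
    #include <cstdio>

    // Same expression as the padded_row_size computation above; 512 is an example
    // padding value, not necessarily what MATRIX_ROW_PADDING is defined to.
    static int64_t pad_row_size(int64_t ne10, int64_t padding) {
        return ne10 % padding == 0 ? ne10 : ne10 - ne10 % padding + padding;
    }

    int main() {
        printf("%lld\n", (long long) pad_row_size(4096, 512)); // 4096 (already a multiple)
        printf("%lld\n", (long long) pad_row_size(5000, 512)); // 5120 (rounded up)
        return 0;
    }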
@@ -2979,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +4625,7 @@ inline void ggml_cuda_op_mul_mat_vec(
             ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -3047,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         ggml_cuda_pool_free(src1_q8_1, as);
     } else {
         // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef
+#ifdef GGML_CUDA_F16
         size_t ash;
         dfloat * src1_dfloat = nullptr; // dfloat == half
 
@@ -3063,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         }
 #else
         dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
-#endif //
+#endif // GGML_CUDA_F16
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -3104,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
                 break;
         }
 
-#ifdef
+#ifdef GGML_CUDA_F16
         if (src1_convert_f16) {
             ggml_cuda_pool_free(src1_dfloat, ash);
         }
-#endif //
+#endif // GGML_CUDA_F16
     }
 
     (void) src1;
@@ -3168,6 +4787,7 @@ inline void ggml_cuda_op_rope(
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
     const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +4801,18 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
-    bool is_glm = mode & 4;
+    const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
+        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
     } else {
-
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }
 
     (void) src1;
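The rope hunks above split the old single position p into a starting position and a per-row increment: for the non-GLM path the host now passes p0 = n_past*freq_scale (or 0), p_delta = freq_scale and p_delta_rows = ne01 to rope_f32_cuda, so the kernel can recover each row's position instead of receiving one precomputed value per i02. A hedged sketch of how such a parameterization can be evaluated per element; the helper name and the assumption that the angle is p*theta_scale^(col/2) are mine, not taken from this diff.

    #include <cmath>
    #include <cstdio>

    // Illustrative only: reconstruct a rotation angle from (p0, p_delta, p_delta_rows,
    // theta_scale) the way a kernel with this interface could.
    static float rope_theta_sketch(float p0, float p_delta, int p_delta_rows,
                                   float theta_scale, int row, int col) {
        const float p = p0 + p_delta * (row / p_delta_rows); // position of this row's token
        return p * powf(theta_scale, (float)(col / 2));      // angle for this column pair
    }

    int main() {
        // e.g. n_past = 10, freq_scale = 1.0f, ne01 = 32 rows per token
        printf("%f\n", rope_theta_sketch(10.0f, 1.0f, 32, 0.9f, 40, 4)); // 11 * 0.9^2
        return 0;
    }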
@@ -3363,7 +4984,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_low, row_high;
         if (split) {
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-
+            row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % GGML_CUDA_MMQ_Y;
+            }
         } else {
             row_low = 0;
             row_high = nrows0*i02_divisor;
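The new split logic rounds each device's starting row down to a multiple of GGML_CUDA_MMQ_Y (the same rounding reappears in ggml_cuda_transform_tensor further down), and only the last device is pinned to nrows0 so no rows are lost. A worked example of the boundary rounding, with 64 standing in for GGML_CUDA_MMQ_Y:

    #include <cstdio>

    static const int MMQ_Y_EXAMPLE = 64; // stand-in for GGML_CUDA_MMQ_Y

    int main() {
        const int   nrows0 = 1000;
        const float split  = 0.5f;            // e.g. g_tensor_split[1] for two equal GPUs
        int boundary = (int)(nrows0 * split); // 500
        boundary -= boundary % MMQ_Y_EXAMPLE; // 448, a multiple of 64
        // device 0 covers rows [0, 448), the last device covers [448, 1000)
        printf("boundary = %d\n", boundary);
        return 0;
    }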
@@ -3529,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (split) {
                     // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                     // dst is NOT transposed.
-                    // The outputs of
+                    // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                     // Instead they need to be copied to the correct slice in ne0 = dst row index.
                     // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-
-
-
-                    }
+                    float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                    CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                 i01_diff*sizeof(float), ne1, kind, cudaStream_main));
                 } else {
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                     CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
@@ -3718,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
-
+            int min_compute_capability = INT_MAX;
+            for (int id = 0; id < g_device_count; ++id) {
+                if (min_compute_capability > g_compute_capabilities[id]) {
+                    min_compute_capability = g_compute_capabilities[id];
+                }
+            }
+
+            if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+            } else {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+            }
         }
     } else {
         GGML_ASSERT(false);
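The dispatch above takes the minimum compute capability over all devices (stored as 100*major + 10*minor during ggml_init_cublas) and only uses the new quantized kernels when g_mul_mat_q is set, src0 is quantized, and that minimum reaches MIN_CC_DP4A, which presumably marks the first architecture with the __dp4a instruction; otherwise it keeps the cuBLAS path. A small sketch of the capability encoding and the min reduction, with illustrative example values:

    #include <climits>
    #include <cstdio>

    int main() {
        // Encoded as 100*major + 10*minor, e.g. compute capability 6.1 -> 610, 8.6 -> 860.
        const int capabilities[] = { 860, 610 }; // hypothetical two-GPU system
        int min_cc = INT_MAX;
        for (int cc : capabilities) {
            if (cc < min_cc) {
                min_cc = cc;
            }
        }
        printf("min compute capability = %d\n", min_cc); // 610: the weakest GPU decides
        return 0;
    }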
@@ -3795,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-
+
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const bool is_glm = mode & 4;
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-
+        row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+        if (id == g_device_count - 1) {
+            row_high = nrows;
+        } else {
+            row_high = nrows*g_tensor_split[id + 1];
+            row_high -= row_high % GGML_CUDA_MMQ_Y;
+        }
     } else {
         GGML_ASSERT(false);
     }
@@ -4002,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }
 
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
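ggml_cuda_set_mul_mat_q is just a setter for the g_mul_mat_q flag consulted in ggml_cuda_mul_mat, so callers toggle the new kernels globally before evaluating a graph. A minimal usage sketch, assuming the matching declaration is exported through ggml-cuda.h:

    #include "ggml-cuda.h"

    int main() {
        ggml_init_cublas();
        // Opt in to the custom quantized matmul kernels; the dispatch still falls back
        // to cuBLAS when src0 is not quantized or a GPU lacks DP4A support.
        ggml_cuda_set_mul_mat_q(true);
        return 0;
    }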