llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11
 
-#ifdef
+#ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //
+#endif //GGML_CUDA_F16
+
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
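The new `get_int_from_*` helpers above exist so that the integer-intrinsic (`__dp4a`) paths further down can read four consecutive quant bytes as a single 32-bit value. A minimal host-side sketch of the same packing, assuming a little-endian target; the reference function is illustrative and not part of the gem:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Mirrors the unaligned helper: two 16-bit loads, because the source pointer
// is only guaranteed to be 2-byte aligned.
static int get_int_from_uint8_ref(const uint8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32);
    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}

int main() {
    const uint8_t qs[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
    int a = get_int_from_uint8_ref(qs, 1);
    int b;
    memcpy(&b, qs + 4, sizeof(b)); // what the *_aligned variants do with one 4-byte load
    printf("%08x %08x\n", (unsigned) a, (unsigned) b); // both print 88776655 on little-endian hardware
    return 0;
}
```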
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QR4_1 2
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
-
-    half m; // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QR5_1 2
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
-
-    half m; // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qh[4]; // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QR8_1 1
 #define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
-
-    half s; // unquantized sum
+    half2 ds; // ds.x = delta, ds.y = sum
     int8_t qs[QK8_0]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
+typedef void (*load_tiles_cuda_t)(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
 
 //================================= k-quants
 
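The recurring struct change in this file (here and in the k-quant blocks below) merges two separate `half` members into one `half2`, so both per-block constants sit in a single 32-bit word and can be unpacked together. A sketch of the resulting access pattern, using an illustrative `block_q4_1`-like layout rather than the gem's actual headers:

```cuda
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

struct block_q4_1_like {
    half2   dm;      // dm.x = delta, dm.y = min
    uint8_t qs[16];  // nibbles / quants
};

__global__ void dequant_first(const block_q4_1_like * b, float * out) {
    const float2 dm = __half22float2(b->dm); // one load yields both constants
    const int vui   = b->qs[0];
    out[0] = (vui & 0xF) * dm.x + dm.y;      // same math as dequantize_q4_1
    out[1] = (vui >>  4) * dm.x + dm.y;
}

int main() {
    block_q4_1_like h = {};
    h.dm.x  = __float2half(0.5f);  // d
    h.dm.y  = __float2half(-1.0f); // m
    h.qs[0] = 0x3A;                // low nibble 10, high nibble 3
    block_q4_1_like * d_b; float * d_out; float out[2];
    cudaMalloc(&d_b, sizeof(h));
    cudaMalloc(&d_out, 2 * sizeof(float));
    cudaMemcpy(d_b, &h, sizeof(h), cudaMemcpyHostToDevice);
    dequant_first<<<1, 1>>>(d_b, d_out);
    cudaMemcpy(out, d_out, 2 * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%.2f %.2f\n", out[0], out[1]); // expect 4.00 and 0.50
    return 0;
}
```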
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
-
-    half dmin; // super-block scale for quantized mins
+    half2 dm; // super-block scale for quantized scales/mins
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -180,8 +211,7 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
-
-    half dmin; // super-block scale for quantized mins
+    half2 dm; // super-block scale for quantized scales/mins
     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
@@ -200,11 +230,10 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
 #else
 typedef struct {
-
-
-    uint8_t
-    uint8_t
-    uint8_t qs[QK_K/2]; // quants, low 4 bits
+    half2 dm; // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
@@ -220,7 +249,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
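The value now given to `MATRIX_ROW_PADDING` implies the usual round-up to the next multiple of 512 when sizing the quantized copy of a matrix row; a small sketch of that arithmetic (the helper name is illustrative):

```cpp
#include <cstdio>

#define MATRIX_ROW_PADDING 512

// Round a row length up to the next multiple of MATRIX_ROW_PADDING.
static int padded_row_size(int ncols) {
    return (ncols + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
}

int main() {
    printf("%d %d %d\n", padded_row_size(1), padded_row_size(512), padded_row_size(4097));
    // prints: 512 512 4608
    return 0;
}
```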
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
+#ifndef GGML_CUDA_MMQ_Y
+#define GGML_CUDA_MMQ_Y 64
+#endif // GGML_CUDA_MMQ_Y
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -332,12 +365,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
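With this change the epsilon is supplied by the caller instead of being fixed at `1e-6f` inside the kernel. For reference, the per-row quantity the kernel computes is the standard RMS normalization:

$$ y_i = \frac{x_i}{\sqrt{\tfrac{1}{n}\sum_{j=1}^{n} x_j^{2} + \varepsilon}} $$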
@@ -369,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
     v.x = vui & 0xF;
     v.y = vui >> 4;
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {8.0f, 8.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 8.0f) * d;
     v.y = (v.y - 8.0f) * d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].
-    const dfloat m = x[ib].
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     const int vui = x[ib].qs[iqs];
 
     v.x = vui & 0xF;
     v.y = vui >> 4;
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -412,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {16.0f, 16.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 16.0f) * d;
     v.y = (v.y - 16.0f) * d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].
-    const dfloat m = x[ib].
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -436,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -453,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
     v.x = x[ib].qs[iqs + 0];
     v.y = x[ib].qs[iqs + 1];
 
-#ifdef
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
 #else
     v.x *= d;
     v.y *= d;
-#endif //
+#endif // GGML_CUDA_F16
 }
 
 //================================== k-quants
@@ -477,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].
-    float dmin = x[i].
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -488,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].
-    float dmin = x[i].
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -575,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -614,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].
-    const float dmin = x[i].
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -727,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         const float * y = yy + i * QK_K + y_offset;
         const uint8_t * q = x[i].qs + q_offset;
 
-        const float dall = x[i].
-        const float dmin = x[i].
+        const float dall = x[i].dm.x;
+        const float dmin = x[i].dm.y;
 
         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         aux[0] = a[0] & 0x0f0f0f0f;
@@ -770,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         uaux[0] = s[0] & 0x0f0f0f0f;
         uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
 
-        const
-
-        const float2 dall = __half22float2(dh[0]);
+        const float2 dall = __half22float2(x[i].dm);
 
         float sum1 = 0, sum2 = 0;
         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -935,17 +964,23 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].
-        const float dmin = x[i].
+        const float dall = x[i].dm.x;
+        const float dmin = x[i].dm.y;
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -953,14 +988,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l <
-            s.x += y1[l] *
-            s.z += y2[l] *
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
     }
 #else
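The reworked q4_K loop above reads eight packed 4-bit quants as one 32-bit word and separates them with two masks; the high-nibble terms remain multiplied by 16, which is why their scale factors carry the extra `1.f/16.f` in the `tmp` update. A host-side sketch of the masking, independent of the kernel:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t q  = 0x84736251u;      // 8 packed 4-bit quants
    const uint32_t lo = q & 0x0f0f0f0fu;  // quants 0, 2, 4, 6
    const uint32_t hi = q & 0xf0f0f0f0u;  // quants 1, 3, 5, 7, each still scaled by 16
    const uint8_t * plo = (const uint8_t *) &lo;
    const uint8_t * phi = (const uint8_t *) &hi;
    for (int i = 0; i < 4; ++i) {
        printf("lo[%d] = %d   hi[%d]/16 = %d\n", i, (int) plo[i], i, (int) phi[i] / 16);
    }
    return 0;
}
```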
@@ -1040,16 +1102,18 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {
 
         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh = x[i].qh + l0;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].
-        const float dmin = x[i].
+        const float dall = x[i].dm.x;
+        const float dmin = x[i].dm.y;
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1059,15 +1123,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (
-                   + y1[l+16] * (
-            sum.y += y1[l+32] * (
-                   + y1[l+48] * (
-            sum.z += y2[l+ 0] * (
-                   + y2[l+16] * (
-            sum.w += y2[l+32] * (
-                   + y2[l+48] * (
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
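In the rewritten q5_K loop, the low four bits of each quant come from `q4` and the fifth bit is re-attached by testing one bit of the separate `qh` byte, adding 16 when it is set. A tiny standalone sketch of that reconstruction; the values are made up for illustration:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t q4  = 7;    // low 4 bits of the quant
    const uint8_t qh  = 0x02; // packed high bits for this position
    const uint8_t hm1 = 0x02; // mask selecting this quant's high bit
    const int q5 = q4 + ((qh & hm1) ? 16 : 0); // 5-bit value in 0..31
    printf("%d\n", q5); // prints 23
    return 0;
}
```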
@@ -1227,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int
-    const int
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (
+    if (ix >= kx_padded) {
         return;
     }
 
+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib =
-    const int iqs =
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
 
-    const float xi =
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
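The new `quantize_q8_1(x, vy, kx, kx_padded)` signature suggests a two-dimensional launch over padded columns and rows, with out-of-range elements quantized as `0.0f` (see the `xi` line above). A sketch of the corresponding grid arithmetic; the variable names are illustrative rather than taken from the gem:

```cpp
#include <cstdio>

#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define MATRIX_ROW_PADDING       512

int main() {
    const int kx        = 4097; // real row length
    const int nrows     = 32;
    const int kx_padded = (kx + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
    const int blocks_x  = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    // e.g. dim3 num_blocks(blocks_x, nrows, 1); dim3 block(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    printf("kx_padded = %d, blocks_x = %d, rows = %d\n", kx_padded, blocks_x, nrows);
    return 0;
}
```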
@@ -1258,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].
-    y[ib].
+    y[ib].ds.x = d;
+    y[ib].ds.y = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1283,363 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
|
|
1283
1361
|
y[iybs + iqs + y_offset] = v.y;
|
1284
1362
|
}
|
1285
1363
|
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1364
|
+
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
1365
|
+
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
1366
|
+
|
1367
|
+
#define VDR_Q4_0_Q8_1_MMVQ 2
|
1368
|
+
#define VDR_Q4_0_Q8_1_MMQ 4
|
1290
1369
|
|
1291
|
-
|
1292
|
-
|
1293
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1294
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
|
1370
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
1371
|
+
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
1295
1372
|
|
1296
|
-
|
1373
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1374
|
+
int sumi = 0;
|
1297
1375
|
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1376
|
+
#pragma unroll
|
1377
|
+
for (int i = 0; i < vdr; ++i) {
|
1378
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1379
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1301
1380
|
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1381
|
+
// SIMD dot product of quantized values
|
1382
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1383
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1384
|
+
}
|
1305
1385
|
|
1306
|
-
|
1386
|
+
// second part effectively subtracts 8 from each quant value
|
1387
|
+
return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
|
1307
1388
|
#else
|
1308
1389
|
return 0.0f; // only to satisfy the compiler
|
1309
1390
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1310
1391
|
}
|
1311
1392
|
|
1312
|
-
|
1313
|
-
|
1314
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1315
|
-
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1393
|
+
#define VDR_Q4_1_Q8_1_MMVQ 2
|
1394
|
+
#define VDR_Q4_1_Q8_1_MMQ 4
|
1316
1395
|
|
1317
|
-
|
1318
|
-
const int
|
1319
|
-
|
1396
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
1397
|
+
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
1398
|
+
|
1399
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1400
|
+
int sumi = 0;
|
1320
1401
|
|
1321
|
-
|
1322
|
-
|
1323
|
-
|
1402
|
+
#pragma unroll
|
1403
|
+
for (int i = 0; i < vdr; ++i) {
|
1404
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1405
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1324
1406
|
|
1325
|
-
|
1326
|
-
|
1407
|
+
// SIMD dot product of quantized values
|
1408
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1409
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1410
|
+
}
|
1327
1411
|
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1412
|
+
#ifdef GGML_CUDA_F16
|
1413
|
+
const half2 tmp = __hmul2(dm4, ds8);
|
1414
|
+
const float d4d8 = __half2float(tmp.x);
|
1415
|
+
const float m4s8 = __half2float(tmp.y);
|
1416
|
+
#else
|
1417
|
+
const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
|
1418
|
+
const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
|
1419
|
+
#endif // GGML_CUDA_F16
|
1331
1420
|
|
1332
|
-
|
1421
|
+
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1422
|
+
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1333
1423
|
#else
|
1334
1424
|
return 0.0f; // only to satisfy the compiler
|
1335
1425
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1336
1426
|
}
|
1337
1427
|
|
1338
|
-
|
1339
|
-
|
1428
|
+
#define VDR_Q5_0_Q8_1_MMVQ 2
|
1429
|
+
#define VDR_Q5_0_Q8_1_MMQ 4
|
1430
|
+
|
1431
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
1432
|
+
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
1433
|
+
|
1340
1434
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1341
|
-
|
1435
|
+
int sumi = 0;
|
1436
|
+
|
1437
|
+
for (int i = 0; i < vdr; ++i) {
|
1438
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1439
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1440
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1441
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1442
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1443
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1444
|
+
|
1445
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1446
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1447
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1448
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1449
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1450
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1451
|
+
}
|
1342
1452
|
|
1343
|
-
|
1344
|
-
|
1345
|
-
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1346
|
-
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1347
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1348
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
|
1349
|
-
|
1350
|
-
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
|
1351
|
-
|
1352
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1353
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1354
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1355
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1356
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1357
|
-
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
|
1358
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1359
|
-
|
1360
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1361
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1362
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1363
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1364
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1365
|
-
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
|
1366
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1367
|
-
|
1368
|
-
return sumi*d;
|
1453
|
+
// second part effectively subtracts 16 from each quant value
|
1454
|
+
return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
|
1369
1455
|
#else
|
1370
1456
|
return 0.0f; // only to satisfy the compiler
|
1371
1457
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1372
1458
|
}
|
1373
1459
|
|
1374
|
-
|
1375
|
-
|
1460
|
+
#define VDR_Q5_1_Q8_1_MMVQ 2
|
1461
|
+
#define VDR_Q5_1_Q8_1_MMQ 4
|
1462
|
+
|
1463
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
|
1464
|
+
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
|
1465
|
+
|
1376
1466
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1377
|
-
|
1467
|
+
int sumi = 0;
|
1468
|
+
|
1469
|
+
for (int i = 0; i < vdr; ++i) {
|
1470
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1471
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1472
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1473
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1474
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1475
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1476
|
+
|
1477
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1478
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1479
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1480
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1481
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1482
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1483
|
+
}
|
1484
|
+
|
1485
|
+
#ifdef GGML_CUDA_F16
|
1486
|
+
const half2 tmp = __hmul2(dm5, ds8);
|
1487
|
+
const float d5d8 = __half2float(tmp.x);
|
1488
|
+
const float m5s8 = __half2float(tmp.y);
|
1489
|
+
#else
|
1490
|
+
const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
|
1491
|
+
const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
|
1492
|
+
#endif // GGML_CUDA_F16
|
1493
|
+
|
1494
|
+
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
1495
|
+
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1378
1496
|
|
1379
|
-
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
|
1380
|
-
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1381
|
-
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1382
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1383
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
|
1384
|
-
|
1385
|
-
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
|
1386
|
-
const float m = bq5_1->m;
|
1387
|
-
const float s = bq8_1->s;
|
1388
|
-
|
1389
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1390
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1391
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1392
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1393
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1394
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1395
|
-
|
1396
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1397
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1398
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1399
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1400
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1401
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1402
|
-
|
1403
|
-
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
|
1404
1497
|
#else
|
1405
1498
|
return 0.0f; // only to satisfy the compiler
|
1406
1499
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1407
1500
|
}
|
1408
1501
|
|
1409
|
-
|
1410
|
-
|
1411
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1412
|
-
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1502
|
+
#define VDR_Q8_0_Q8_1_MMVQ 2
|
1503
|
+
#define VDR_Q8_0_Q8_1_MMQ 8
|
1413
1504
|
|
1414
|
-
|
1415
|
-
|
1416
|
-
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1505
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
|
1506
|
+
const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
|
1417
1507
|
|
1418
|
-
|
1508
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1509
|
+
int sumi = 0;
|
1419
1510
|
|
1420
|
-
|
1421
|
-
|
1511
|
+
for (int i = 0; i < vdr; ++i) {
|
1512
|
+
// SIMD dot product of quantized values
|
1513
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1514
|
+
}
|
1422
1515
|
|
1423
|
-
return sumi*
|
1516
|
+
return sumi * d8_0 * __half2float(ds8_1.x);
|
1424
1517
|
#else
|
1425
1518
|
return 0.0f; // only to satisfy the compiler
|
1426
1519
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1427
1520
|
}
|
1428
1521
|
|
1429
|
-
static __device__ __forceinline__ float
|
1430
|
-
const
|
1522
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
|
1523
|
+
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
|
1431
1524
|
|
1432
1525
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1433
|
-
|
1526
|
+
int sumi = 0;
|
1434
1527
|
|
1435
|
-
|
1436
|
-
|
1528
|
+
for (int i = 0; i < vdr; ++i) {
|
1529
|
+
// SIMD dot product of quantized values
|
1530
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1531
|
+
}
|
1437
1532
|
|
1438
|
-
|
1439
|
-
|
1533
|
+
#ifdef GGML_CUDA_F16
|
1534
|
+
const half2 tmp = __hmul2(dm8, ds8);
|
1535
|
+
const float d8d8 = __half2float(tmp.x);
|
1536
|
+
const float m8s8 = __half2float(tmp.y);
|
1537
|
+
#else
|
1538
|
+
const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
|
1539
|
+
const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
|
1540
|
+
#endif // GGML_CUDA_F16
|
1440
1541
|
|
1441
|
-
|
1442
|
-
|
1542
|
+
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1543
|
+
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1544
|
+
#else
|
1545
|
+
return 0.0f; // only to satisfy the compiler
|
1546
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1547
|
+
}
|
1443
1548
|
|
1444
|
-
|
1549
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
1550
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1445
1551
|
|
1446
|
-
|
1447
|
-
|
1552
|
+
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1553
|
+
|
1554
|
+
int v[VDR_Q4_0_Q8_1_MMVQ];
|
1555
|
+
int u[2*VDR_Q4_0_Q8_1_MMVQ];
|
1556
|
+
|
1557
|
+
#pragma unroll
|
1558
|
+
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
|
1559
|
+
v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
|
1560
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1561
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
|
1562
|
+
}
|
1563
|
+
|
1564
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1568
|
+
|
1569
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1570
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
|
1571
|
+
|
1572
|
+
*x_ql = tile_x_qs;
|
1573
|
+
*x_dm = (half2 *) tile_x_d;
|
1574
|
+
}
|
1575
|
+
|
1576
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1577
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1578
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1579
|
+
|
1580
|
+
__builtin_assume(i_offset >= 0);
|
1581
|
+
__builtin_assume(i_offset < 8);
|
1582
|
+
__builtin_assume(k >= 0);
|
1583
|
+
__builtin_assume(k < WARP_SIZE);
|
1584
|
+
|
1585
|
+
const int kbx = k / QI4_0;
|
1586
|
+
const int kqsx = k % QI4_0;
|
1587
|
+
|
1588
|
+
const block_q4_0 * bx0 = (block_q4_0 *) vx;
|
1589
|
+
|
1590
|
+
float * x_dmf = (float *) x_dm;
|
1591
|
+
|
1592
|
+
#pragma unroll
|
1593
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1594
|
+
int i = i0 + i_offset;
|
1595
|
+
|
1596
|
+
if (need_check) {
|
1597
|
+
i = min(i, i_max);
|
1598
|
+
}
|
1599
|
+
|
1600
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1601
|
+
|
1602
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1603
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1604
|
+
}
|
1605
|
+
|
1606
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1607
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
1608
|
+
|
1609
|
+
// #pragma unroll
|
1610
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
|
1611
|
+
// FIXME out-of-bounds
|
1612
|
+
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1613
|
+
|
1614
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
1615
|
+
// return;
|
1616
|
+
// }
|
1617
|
+
|
1618
|
+
// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1619
|
+
|
1620
|
+
// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
|
1621
|
+
// }
|
1622
|
+
}
|
1623
|
+
|
1624
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1625
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1626
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1627
|
+
|
1628
|
+
__builtin_assume(i >= 0);
|
1629
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1630
|
+
__builtin_assume(j >= 0);
|
1631
|
+
__builtin_assume(j < WARP_SIZE);
|
1632
|
+
__builtin_assume(k >= 0);
|
1633
|
+
__builtin_assume(k < WARP_SIZE);
|
1634
|
+
|
1635
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1636
|
+
const float * x_dmf = (float *) x_dm;
|
1637
|
+
|
1638
|
+
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
1639
|
+
|
1640
|
+
#pragma unroll
|
1641
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1642
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1643
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
|
1644
|
+
}
|
1645
|
+
|
1646
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1647
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1648
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1649
|
+
}
|
1650
|
+
|
1651
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
1652
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1653
|
+
|
1654
|
+
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1655
|
+
|
1656
|
+
int v[VDR_Q4_1_Q8_1_MMVQ];
|
1657
|
+
int u[2*VDR_Q4_1_Q8_1_MMVQ];
|
1658
|
+
|
1659
|
+
#pragma unroll
|
1660
|
+
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
|
1661
|
+
v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
|
1662
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1663
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
|
1664
|
+
}
|
1665
|
+
|
1666
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1667
|
+
}
|
1668
|
+
|
1669
|
+
static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1670
|
+
|
1671
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
|
1672
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
|
1673
|
+
|
1674
|
+
*x_ql = tile_x_qs;
|
1675
|
+
*x_dm = tile_x_dm;
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
1679
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1680
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1681
|
+
|
1682
|
+
__builtin_assume(i_offset >= 0);
|
1683
|
+
__builtin_assume(i_offset < 8);
|
1684
|
+
__builtin_assume(k >= 0);
|
1685
|
+
__builtin_assume(k < WARP_SIZE);
|
1686
|
+
|
1687
|
+
const int kbx = k / QI4_1;
|
1688
|
+
const int kqsx = k % QI4_1;
|
1689
|
+
|
1690
|
+
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
1691
|
+
|
1692
|
+
#pragma unroll
|
1693
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1694
|
+
int i = i0 + i_offset;
|
1695
|
+
|
1696
|
+
if (need_check) {
|
1697
|
+
i = min(i, i_max);
|
1698
|
+
}
|
1699
|
+
|
1700
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1701
|
+
|
1702
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1703
|
+
}
|
1704
|
+
|
1705
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
|
1706
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1707
|
+
|
1708
|
+
#pragma unroll
|
1709
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
|
1710
|
+
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
1711
|
+
|
1712
|
+
if (need_check) {
|
1713
|
+
i = min(i, i_max);
|
1714
|
+
}
|
1715
|
+
|
1716
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1717
|
+
|
1718
|
+
x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
|
1719
|
+
}
|
1720
|
+
}
|
1721
|
+
|
1722
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
1723
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1724
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1725
|
+
|
1726
|
+
__builtin_assume(i >= 0);
|
1727
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1728
|
+
__builtin_assume(j >= 0);
|
1729
|
+
__builtin_assume(j < WARP_SIZE);
|
1730
|
+
__builtin_assume(k >= 0);
|
1731
|
+
__builtin_assume(k < WARP_SIZE);
|
1732
|
+
|
1733
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1734
|
+
|
1735
|
+
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
1736
|
+
|
1737
|
+
#pragma unroll
|
1738
|
+
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
1739
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1740
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
|
1741
|
+
}
|
1742
|
+
|
1743
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
1744
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
1745
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1746
|
+
}
|
1747
|
+
|
1748
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
1749
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1750
|
+
|
1751
|
+
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
|
1752
|
+
|
1753
|
+
int vl[VDR_Q5_0_Q8_1_MMVQ];
|
1754
|
+
int vh[VDR_Q5_0_Q8_1_MMVQ];
|
1755
|
+
int u[2*VDR_Q5_0_Q8_1_MMVQ];
|
1756
|
+
|
1757
|
+
#pragma unroll
|
1758
|
+
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
|
1759
|
+
vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
|
1760
|
+
vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
|
1761
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1762
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
|
1763
|
+
}
|
1764
|
+
|
1765
|
+
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
1766
|
+
}
|
1767
|
+
|
1768
|
+
static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1769
|
+
|
1770
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1771
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
|
1772
|
+
|
1773
|
+
*x_ql = tile_x_ql;
|
1774
|
+
*x_dm = (half2 *) tile_x_d;
|
1775
|
+
}
|
1776
|
+
|
1777
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
1778
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1779
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1780
|
+
|
1781
|
+
__builtin_assume(i_offset >= 0);
|
1782
|
+
__builtin_assume(i_offset < 8);
|
1783
|
+
__builtin_assume(k >= 0);
|
1784
|
+
__builtin_assume(k < WARP_SIZE);
|
1785
|
+
|
1786
|
+
const int kbx = k / QI5_0;
|
1787
|
+
const int kqsx = k % QI5_0;
|
1788
|
+
|
1789
|
+
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
1790
|
+
|
1791
|
+
#pragma unroll
|
1792
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1793
|
+
int i = i0 + i_offset;
|
1794
|
+
|
1795
|
+
if (need_check) {
|
1796
|
+
i = min(i, i_max);
|
1797
|
+
}
|
1798
|
+
|
1799
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1800
|
+
|
1801
|
+
const int ql = get_int_from_uint8(bxi->qs, kqsx);
|
1802
|
+
const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
|
1803
|
+
|
1804
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1805
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1806
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1807
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1808
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1809
|
+
qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
|
1810
|
+
|
1811
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1812
|
+
|
1813
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1814
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1815
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1816
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1817
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1818
|
+
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
1819
|
+
|
1820
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1821
|
+
}
|
1822
|
+
|
1823
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
|
1824
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1825
|
+
float * x_dmf = (float *) x_dm;
|
1826
|
+
|
1827
|
+
#pragma unroll
|
1828
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
|
1829
|
+
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
1830
|
+
|
1831
|
+
if (need_check) {
|
1832
|
+
i = min(i, i_max);
|
1833
|
+
}
|
1834
|
+
|
1835
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1836
|
+
|
1837
|
+
x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
|
1838
|
+
}
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
1842
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1843
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1844
|
+
|
1845
|
+
__builtin_assume(i >= 0);
|
1846
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1847
|
+
__builtin_assume(j >= 0);
|
1848
|
+
__builtin_assume(j < WARP_SIZE);
|
1849
|
+
__builtin_assume(k >= 0);
|
1850
|
+
__builtin_assume(k < WARP_SIZE);
|
1851
|
+
|
1852
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1853
|
+
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
1854
|
+
const float * x_dmf = (float *) x_dm;
|
1855
|
+
|
1856
|
+
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
1857
|
+
|
1858
|
+
#pragma unroll
|
1859
|
+
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
1860
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1861
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
1865
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1866
|
+
}
|
1867
|
+
|
1868
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
1869
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1870
|
+
|
1871
|
+
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
|
1872
|
+
|
1873
|
+
int vl[VDR_Q5_1_Q8_1_MMVQ];
|
1874
|
+
int vh[VDR_Q5_1_Q8_1_MMVQ];
|
1875
|
+
int u[2*VDR_Q5_1_Q8_1_MMVQ];
|
1876
|
+
|
1877
|
+
#pragma unroll
|
1878
|
+
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
|
1879
|
+
vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
|
1880
|
+
vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
|
1881
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1882
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
|
1883
|
+
}
|
1884
|
+
|
1885
|
+
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
1886
|
+
}
|
1887
|
+
|
1888
|
+
static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1889
|
+
|
1890
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1891
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
|
1892
|
+
|
1893
|
+
*x_ql = tile_x_ql;
|
1894
|
+
*x_dm = tile_x_dm;
|
1895
|
+
}
|
1896
|
+
|
1897
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
1898
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1899
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1900
|
+
|
1901
|
+
__builtin_assume(i_offset >= 0);
|
1902
|
+
__builtin_assume(i_offset < 8);
|
1903
|
+
__builtin_assume(k >= 0);
|
1904
|
+
__builtin_assume(k < WARP_SIZE);
|
1905
|
+
|
1906
|
+
const int kbx = k / QI5_1;
|
1907
|
+
const int kqsx = k % QI5_1;
|
1908
|
+
|
1909
|
+
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
1910
|
+
|
1911
|
+
#pragma unroll
|
1912
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1913
|
+
int i = i0 + i_offset;
|
1914
|
+
|
1915
|
+
if (need_check) {
|
1916
|
+
i = min(i, i_max);
|
1917
|
+
}
|
1918
|
+
|
1919
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1920
|
+
|
1921
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1922
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
|
1923
|
+
|
1924
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1925
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1926
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1927
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1928
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1929
|
+
|
1930
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1931
|
+
|
1932
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1933
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1934
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1935
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1936
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1937
|
+
|
1938
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1939
|
+
}
|
1940
|
+
|
1941
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
|
1942
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1943
|
+
|
1944
|
+
#pragma unroll
|
1945
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
|
1946
|
+
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
1947
|
+
|
1948
|
+
if (need_check) {
|
1949
|
+
i = min(i, i_max);
|
1950
|
+
}
|
1951
|
+
|
1952
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1953
|
+
|
1954
|
+
x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
|
1955
|
+
}
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
1959
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1960
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1961
|
+
|
1962
|
+
__builtin_assume(i >= 0);
|
1963
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1964
|
+
__builtin_assume(j >= 0);
|
1965
|
+
__builtin_assume(j < WARP_SIZE);
|
1966
|
+
__builtin_assume(k >= 0);
|
1967
|
+
__builtin_assume(k < WARP_SIZE);
|
1968
|
+
|
1969
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1970
|
+
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
1971
|
+
|
1972
|
+
int u[2*VDR_Q5_1_Q8_1_MMQ];
|
1973
|
+
|
1974
|
+
#pragma unroll
|
1975
|
+
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
1976
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1977
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
|
1978
|
+
}
|
1979
|
+
|
1980
|
+
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
1981
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1982
|
+
}
|
1983
|
+
|
1984
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
1985
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1986
|
+
|
1987
|
+
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1988
|
+
|
1989
|
+
int v[VDR_Q8_0_Q8_1_MMVQ];
|
1990
|
+
int u[VDR_Q8_0_Q8_1_MMVQ];
|
1991
|
+
|
1992
|
+
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
1993
|
+
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
1994
|
+
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1995
|
+
}
|
1996
|
+
|
1997
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
|
1998
|
+
}
|
1999
|
+
|
2000
|
+
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2001
|
+
|
2002
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2003
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
|
2004
|
+
|
2005
|
+
*x_ql = tile_x_qs;
|
2006
|
+
*x_dm = (half2 *) tile_x_d;
|
2007
|
+
}
|
2008
|
+
|
2009
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2010
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2011
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2012
|
+
|
2013
|
+
__builtin_assume(i_offset >= 0);
|
2014
|
+
__builtin_assume(i_offset < 8);
|
2015
|
+
__builtin_assume(k >= 0);
|
2016
|
+
__builtin_assume(k < WARP_SIZE);
|
2017
|
+
|
2018
|
+
const int kbx = k / QI8_0;
|
2019
|
+
const int kqsx = k % QI8_0;
|
2020
|
+
float * x_dmf = (float *) x_dm;
|
2021
|
+
|
2022
|
+
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2023
|
+
|
2024
|
+
#pragma unroll
|
2025
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2026
|
+
int i = i0 + i_offset;
|
2027
|
+
|
2028
|
+
if (need_check) {
|
2029
|
+
i = min(i, i_max);
|
2030
|
+
}
|
2031
|
+
|
2032
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2033
|
+
|
2034
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2035
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
|
2036
|
+
}
|
2037
|
+
|
2038
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2039
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
2040
|
+
|
2041
|
+
// #pragma unroll
|
2042
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
|
2043
|
+
// FIXME out-of-bounds
|
2044
|
+
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2045
|
+
|
2046
|
+
// #if GGML_CUDA_MMQ_Y < 64
|
2047
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
2048
|
+
// return;
|
2049
|
+
// }
|
2050
|
+
// #endif // GGML_CUDA_MMQ_Y < 64
|
2051
|
+
|
2052
|
+
// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2053
|
+
|
2054
|
+
// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
|
2055
|
+
// }
|
2056
|
+
}
|
2057
|
+
|
2058
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2059
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2060
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2061
|
+
|
2062
|
+
__builtin_assume(i >= 0);
|
2063
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2064
|
+
__builtin_assume(j >= 0);
|
2065
|
+
__builtin_assume(j < WARP_SIZE);
|
2066
|
+
__builtin_assume(k >= 0);
|
2067
|
+
__builtin_assume(k < WARP_SIZE);
|
2068
|
+
|
2069
|
+
const float * x_dmf = (float *) x_dm;
|
2070
|
+
|
2071
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2072
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2073
|
+
y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2074
|
+
}
|
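
Note on the tile indexing used here and in the other load_tiles_* / *_mul_mat helpers: rows of the shared x_ql tile are laid out with a stride of WARP_SIZE + 1 ints (x_ql[i * (WARP_SIZE + 1) + k]). The extra element per row is most likely the usual shared-memory padding trick, so that a warp reading one column k across consecutive rows touches different banks. A small host-side sketch of the bank arithmetic, assuming 32 four-byte banks:

    #include <cstdio>

    // Bank index of element (row, col) of a row-major int tile (stride in ints),
    // assuming 32 four-byte shared-memory banks.
    static int bank(int row, int col, int stride) {
        return (row * stride + col) % 32;
    }

    int main() {
        const int k = 5; // one column read by several threads of a warp
        for (int row = 0; row < 8; ++row) {
            std::printf("row %d: stride 32 -> bank %2d, stride 33 -> bank %2d\n",
                        row, bank(row, k, 32), bank(row, k, 33));
        }
        // With stride 32 every row lands in the same bank; with stride 33
        // (WARP_SIZE + 1) consecutive rows hit different banks.
        return 0;
    }
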
2075
|
+
|
2076
|
+
#define VDR_q2_K_q8_1 1
|
2077
|
+
|
2078
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
|
2079
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2080
|
+
const half2 & dm, const float * __restrict__ d8) {
|
2081
|
+
|
2082
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2083
|
+
float sumf_d = 0.0f;
|
2084
|
+
float sumf_m = 0.0f;
|
2085
|
+
|
2086
|
+
for (int i = 0; i < QR2_K; ++i) {
|
2087
|
+
const int sc = scales[2*i];
|
2088
|
+
|
2089
|
+
const int vi = (v >> (2*i)) & 0x03030303;
|
2090
|
+
|
2091
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
2092
|
+
|
2093
|
+
int sc_high = sc >> 4;
|
2094
|
+
sc_high |= sc_high << 8;
|
2095
|
+
sc_high |= sc_high << 16;
|
2096
|
+
sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
const float2 dmf = __half22float2(dm);
|
2100
|
+
|
2101
|
+
return dmf.x*sumf_d - dmf.y*sumf_m;
|
2102
|
+
#else
|
2103
|
+
return 0.0f; // only to satisfy the compiler
|
2104
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2105
|
+
}
|
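
For reference: each scales[] byte consumed by vec_dot_q2_K_q8_1_impl packs two 4-bit fields, the low nibble weighting the dp4a dot product and the high nibble weighting the block minimum that is subtracted through dm.y. The two shift-or lines only broadcast that high nibble into all four byte lanes so one dp4a can form (high nibble) * (sum of the four q8_1 lanes). A small host-side check of the broadcast (illustration only):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint8_t sc = 0xB7;   // low nibble 0x7 = scale, high nibble 0xB = min weight
        int sc_high = sc >> 4;     // 0x0000000B
        sc_high |= sc_high << 8;   // 0x00000B0B
        sc_high |= sc_high << 16;  // 0x0B0B0B0B
        std::printf("scale = %d, broadcast min weight = 0x%08X\n", sc & 0xF, sc_high);
        // __dp4a(sc_high, u[i], 0) then equals (sc >> 4) * (u lane 0 + ... + u lane 3).
        return 0;
    }
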
2106
|
+
|
2107
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
2108
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2109
|
+
|
2110
|
+
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
|
2111
|
+
|
2112
|
+
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
2113
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2114
|
+
|
2115
|
+
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2116
|
+
|
2117
|
+
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2118
|
+
int u[QR2_K];
|
2119
|
+
float d8[QR2_K];
|
2120
|
+
|
2121
|
+
for (int i = 0; i < QR2_K; ++ i) {
|
2122
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2123
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
|
2127
|
+
}
|
2128
|
+
|
2129
|
+
static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2130
|
+
|
2131
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2132
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
|
2133
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2134
|
+
|
2135
|
+
*x_ql = tile_x_ql;
|
2136
|
+
*x_dm = tile_x_dm;
|
2137
|
+
*x_sc = tile_x_sc;
|
2138
|
+
}
|
2139
|
+
|
2140
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2141
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2142
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2143
|
+
|
2144
|
+
__builtin_assume(i_offset >= 0);
|
2145
|
+
__builtin_assume(i_offset < 8);
|
2146
|
+
__builtin_assume(k >= 0);
|
2147
|
+
__builtin_assume(k < WARP_SIZE);
|
2148
|
+
|
2149
|
+
const int kbx = k / QI2_K;
|
2150
|
+
const int kqsx = k % QI2_K;
|
2151
|
+
|
2152
|
+
const block_q2_K * bx0 = (block_q2_K *) vx;
|
2153
|
+
|
2154
|
+
#pragma unroll
|
2155
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2156
|
+
int i = i0 + i_offset;
|
2157
|
+
|
2158
|
+
if (need_check) {
|
2159
|
+
i = min(i, i_max);
|
2160
|
+
}
|
2161
|
+
|
2162
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2163
|
+
|
2164
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2165
|
+
}
|
2166
|
+
|
2167
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
|
2168
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2169
|
+
|
2170
|
+
#pragma unroll
|
2171
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
|
2172
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2173
|
+
|
2174
|
+
if (need_check) {
|
2175
|
+
i = min(i, i_max);
|
2176
|
+
}
|
2177
|
+
|
2178
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2179
|
+
|
2180
|
+
x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
|
2181
|
+
}
|
2182
|
+
|
2183
|
+
#pragma unroll
|
2184
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2185
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2186
|
+
|
2187
|
+
if (need_check) {
|
2188
|
+
i = min(i, i_max);
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
|
2192
|
+
|
2193
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
|
2194
|
+
}
|
2195
|
+
}
|
2196
|
+
|
2197
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
2198
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2199
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2200
|
+
|
2201
|
+
__builtin_assume(i >= 0);
|
2202
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2203
|
+
__builtin_assume(j >= 0);
|
2204
|
+
__builtin_assume(j < WARP_SIZE);
|
2205
|
+
__builtin_assume(k >= 0);
|
2206
|
+
__builtin_assume(k < WARP_SIZE);
|
2207
|
+
|
2208
|
+
const int kbx = k / QI2_K;
|
2209
|
+
const int kqsx = k % QI2_K;
|
2210
|
+
|
2211
|
+
const int bq8_offset = QR2_K * (kqsx / QI8_1);
|
2212
|
+
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2213
|
+
|
2214
|
+
const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
|
2215
|
+
|
2216
|
+
int u[QR2_K];
|
2217
|
+
float d8[QR2_K];
|
2218
|
+
|
2219
|
+
for (int l = 0; l < QR2_K; ++ l) {
|
2220
|
+
const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2221
|
+
u[l] = y_qs[y_qs_index];
|
2222
|
+
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2223
|
+
}
|
2224
|
+
|
2225
|
+
return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
|
2226
|
+
}
|
2227
|
+
|
2228
|
+
#define VDR_q3_K_q8_1 1
|
2229
|
+
|
2230
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
|
2231
|
+
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2232
|
+
const int & scale_offset, const float & d, const float * __restrict__ d8) {
|
2233
|
+
|
2234
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2235
|
+
float sumf = 0.0f;
|
2236
|
+
|
2237
|
+
for (int i = 0; i < QR3_K; ++i) {
|
2238
|
+
const int isc = scale_offset + 2*i;
|
2239
|
+
|
2240
|
+
const int isc_low = isc % (QK_K/32);
|
2241
|
+
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
2242
|
+
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
2243
|
+
|
2244
|
+
const int isc_high = isc % (QK_K/64);
|
2245
|
+
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
2246
|
+
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
2247
|
+
|
2248
|
+
const int sc = (sc_low | sc_high) - 32;
|
2249
|
+
|
2250
|
+
const int vil = (vl >> (2*i)) & 0x03030303;
|
2251
|
+
|
2252
|
+
const int vih = ((vh >> i) << 2) & 0x04040404;
|
2253
|
+
|
2254
|
+
const int vi = __vsubss4(vil, vih);
|
2255
|
+
|
2256
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2257
|
+
}
|
2258
|
+
|
2259
|
+
return d*sumf;
|
2260
|
+
#else
|
2261
|
+
return 0.0f; // only to satisfy the compiler
|
2262
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2263
|
+
}
|
2264
|
+
|
2265
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
2266
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2267
|
+
|
2268
|
+
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
|
2269
|
+
|
2270
|
+
const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
|
2271
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2272
|
+
|
2273
|
+
const float d = bq3_K->d;
|
2274
|
+
|
2275
|
+
const int vl = get_int_from_uint8(bq3_K->qs, iqs);
|
2276
|
+
|
2277
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2278
|
+
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2279
|
+
|
2280
|
+
int u[QR3_K];
|
2281
|
+
float d8[QR3_K];
|
2282
|
+
|
2283
|
+
for (int i = 0; i < QR3_K; ++i) {
|
2284
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2285
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2286
|
+
}
|
2287
|
+
|
2288
|
+
return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2289
|
+
}
|
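
For reference: a q3_K quant is 2 low bits from qs plus 1 high bit from hmask, and (as in the CPU k-quants code) its value is (low | high << 2) - 4. Inverting the mask, as the comment above says, turns that into "subtract 4 only where the stored bit is 0", which __vsubss4(vil, vih) then applies to four lanes at once. A scalar C++ sketch of the equivalence (illustration only; the -4 offset is taken from the k-quants format, not from this hunk):

    #include <cstdio>

    int main() {
        for (int low = 0; low < 4; ++low) {
            for (int high = 0; high < 2; ++high) {
                const int reference  = (low | (high << 2)) - 4;  // canonical value, -4..3 (k-quants convention)
                const int via_invert = low - ((high ^ 1) << 2);  // inverted-mask path
                std::printf("low=%d high=%d -> %d %d\n", low, high, reference, via_invert);
            }
        }
        return 0;
    }
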
2290
|
+
|
2291
|
+
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2292
|
+
|
2293
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2294
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
|
2295
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
|
2296
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2297
|
+
|
2298
|
+
*x_ql = tile_x_ql;
|
2299
|
+
*x_dm = tile_x_dm;
|
2300
|
+
*x_qh = tile_x_qh;
|
2301
|
+
*x_sc = tile_x_sc;
|
2302
|
+
}
|
2303
|
+
|
2304
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2305
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2306
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2307
|
+
|
2308
|
+
__builtin_assume(i_offset >= 0);
|
2309
|
+
__builtin_assume(i_offset < 8);
|
2310
|
+
__builtin_assume(k >= 0);
|
2311
|
+
__builtin_assume(k < WARP_SIZE);
|
2312
|
+
|
2313
|
+
const int kbx = k / QI3_K;
|
2314
|
+
const int kqsx = k % QI3_K;
|
2315
|
+
|
2316
|
+
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2317
|
+
|
2318
|
+
#pragma unroll
|
2319
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2320
|
+
int i = i0 + i_offset;
|
2321
|
+
|
2322
|
+
if (need_check) {
|
2323
|
+
i = min(i, i_max);
|
2324
|
+
}
|
2325
|
+
|
2326
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2327
|
+
|
2328
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
2329
|
+
}
|
2330
|
+
|
2331
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2332
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2333
|
+
|
2334
|
+
#pragma unroll
|
2335
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
|
2336
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2337
|
+
|
2338
|
+
if (need_check) {
|
2339
|
+
i = min(i, i_max);
|
2340
|
+
}
|
2341
|
+
|
2342
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2343
|
+
|
2344
|
+
x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
|
2345
|
+
}
|
2346
|
+
|
2347
|
+
#pragma unroll
|
2348
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
|
2349
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2350
|
+
|
2351
|
+
if (need_check) {
|
2352
|
+
i = min(i, i_max);
|
2353
|
+
}
|
2354
|
+
|
2355
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2356
|
+
|
2357
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2358
|
+
}
|
2359
|
+
|
2360
|
+
#pragma unroll
|
2361
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2362
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2363
|
+
|
2364
|
+
if (need_check) {
|
2365
|
+
i = min(i, i_max);
|
2366
|
+
}
|
2367
|
+
|
2368
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2369
|
+
|
2370
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
|
2371
|
+
}
|
2372
|
+
}
|
2373
|
+
|
2374
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
2375
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2376
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2377
|
+
|
2378
|
+
__builtin_assume(i >= 0);
|
2379
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2380
|
+
__builtin_assume(j >= 0);
|
2381
|
+
__builtin_assume(j < WARP_SIZE);
|
2382
|
+
__builtin_assume(k >= 0);
|
2383
|
+
__builtin_assume(k < WARP_SIZE);
|
2384
|
+
|
2385
|
+
const int kbx = k / QI3_K;
|
2386
|
+
const int kqsx = k % QI3_K;
|
2387
|
+
|
2388
|
+
const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
|
2389
|
+
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2390
|
+
|
2391
|
+
const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
|
2392
|
+
|
2393
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2394
|
+
const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
|
2395
|
+
|
2396
|
+
int u[QR3_K];
|
2397
|
+
float d8[QR3_K];
|
2398
|
+
|
2399
|
+
for (int l = 0; l < QR3_K; ++ l) {
|
2400
|
+
const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2401
|
+
u[l] = y_qs[y_qs_index];
|
2402
|
+
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
|
2406
|
+
x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
|
2407
|
+
}
|
2408
|
+
|
2409
|
+
#define VDR_q4_K_q8_1 2
|
2410
|
+
|
2411
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
|
2412
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2413
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
2414
|
+
|
2415
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2416
|
+
float sumf_d = 0.0f;
|
2417
|
+
float sumf_m = 0.0f;
|
2418
|
+
|
2419
|
+
for (int i = 0; i < QR4_K; ++i) {
|
2420
|
+
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
2421
|
+
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
2422
|
+
|
2423
|
+
const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
|
2424
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
|
2425
|
+
|
2426
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
2427
|
+
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
2428
|
+
}
|
2429
|
+
|
2430
|
+
return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
|
2431
|
+
|
2432
|
+
#else
|
2433
|
+
return 0.0f; // only to satisfy the compiler
|
2434
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2435
|
+
}
|
2436
|
+
|
2437
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
2438
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2439
|
+
|
2440
|
+
#ifndef GGML_QKK_64
|
2441
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2442
|
+
|
2443
|
+
int v[2];
|
2444
|
+
int u[2*QR4_K];
|
2445
|
+
float d8[QR4_K];
|
2446
|
+
|
2447
|
+
// iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
|
2448
|
+
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
|
2449
|
+
|
2450
|
+
// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
|
2451
|
+
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
|
2452
|
+
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
2453
|
+
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
2454
|
+
|
2455
|
+
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2456
|
+
v[0] = q4[0];
|
2457
|
+
v[1] = q4[4];
|
2458
|
+
|
2459
|
+
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
|
2460
|
+
uint16_t aux[2];
|
2461
|
+
const int j = bq8_offset/2;
|
2462
|
+
if (j < 2) {
|
2463
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2464
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2465
|
+
} else {
|
2466
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2467
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2468
|
+
}
|
2469
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2470
|
+
const uint8_t * m = sc + 2;
|
2471
|
+
|
2472
|
+
for (int i = 0; i < QR4_K; ++i) {
|
2473
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2474
|
+
d8[i] = bq8i->ds.x;
|
2475
|
+
|
2476
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2477
|
+
u[2*i+0] = q8[0];
|
2478
|
+
u[2*i+1] = q8[4];
|
2479
|
+
}
|
2480
|
+
|
2481
|
+
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
|
2482
|
+
|
2483
|
+
#else
|
2484
|
+
|
2485
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2486
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2487
|
+
|
2488
|
+
float sumf_d = 0.0f;
|
2489
|
+
float sumf_m = 0.0f;
|
2490
|
+
|
2491
|
+
uint16_t aux16[2];
|
2492
|
+
const uint8_t * s = (const uint8_t *)aux16;
|
2493
|
+
|
2494
|
+
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
2495
|
+
aux16[0] = a[0] & 0x0f0f;
|
2496
|
+
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
2497
|
+
|
2498
|
+
const float dall = bq4_K->d[0];
|
2499
|
+
const float dmin = bq4_K->d[1];
|
2500
|
+
|
2501
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2502
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2503
|
+
|
2504
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2505
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2506
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2507
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2508
|
+
|
2509
|
+
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
2510
|
+
const int v1 = q4[0];
|
2511
|
+
const int v2 = q4[4];
|
2512
|
+
|
2513
|
+
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
2514
|
+
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
2515
|
+
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
2516
|
+
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
2517
|
+
|
2518
|
+
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
2519
|
+
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
2520
|
+
|
2521
|
+
return dall * sumf_d - dmin * sumf_m;
|
2522
|
+
|
2523
|
+
#else
|
2524
|
+
return 0.0f; // only to satisfy the compiler
|
2525
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2526
|
+
|
2527
|
+
#endif
|
2528
|
+
}
|
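
The aux[] masking above (and the identical code in the q5_K and *_mul_mat variants) unpacks the 12-byte scales field of a K-quant super-block, which holds eight 6-bit scales and eight 6-bit mins: the first four of each sit in the low 6 bits of bytes 0-7, while the last four are assembled from the nibbles of bytes 8-11 plus the top 2 bits of bytes 0-7. Reading the field as 16-bit words simply extracts two neighbouring scales (or mins) per mask. A scalar sketch of the per-index extraction this corresponds to, assuming the usual k-quants layout (not code from this package):

    #include <cstdint>
    #include <cstdio>

    // j-th 6-bit scale and min (j = 0..7) from the 12-byte scales array of a
    // q4_K / q5_K super-block, assuming the k-quants packing described above.
    static void scale_min_k4(int j, const uint8_t * q, uint8_t & sc, uint8_t & m) {
        if (j < 4) {
            sc = q[j]     & 63;
            m  = q[j + 4] & 63;
        } else {
            sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
            m  = (q[j + 4] >>   4) | ((q[j - 0] >> 6) << 4);
        }
    }

    int main() {
        uint8_t q[12];
        for (int b = 0; b < 12; ++b) q[b] = (uint8_t)(37 * b + 11); // arbitrary pattern
        for (int j = 0; j < 8; ++j) {
            uint8_t sc, m;
            scale_min_k4(j, q, sc, m);
            std::printf("j=%d  sc=%2d  m=%2d\n", j, sc, m);
        }
        return 0;
    }
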
2529
|
+
|
2530
|
+
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2531
|
+
|
2532
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2533
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
|
2534
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2535
|
+
|
2536
|
+
*x_ql = tile_x_ql;
|
2537
|
+
*x_dm = tile_x_dm;
|
2538
|
+
*x_sc = tile_x_sc;
|
2539
|
+
}
|
2540
|
+
|
2541
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2542
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2543
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2544
|
+
|
2545
|
+
__builtin_assume(i_offset >= 0);
|
2546
|
+
__builtin_assume(i_offset < 8);
|
2547
|
+
__builtin_assume(k >= 0);
|
2548
|
+
__builtin_assume(k < WARP_SIZE);
|
2549
|
+
|
2550
|
+
const int kbx = k / QI4_K; // == 0 if QK_K == 256
|
2551
|
+
const int kqsx = k % QI4_K; // == k if QK_K == 256
|
2552
|
+
|
2553
|
+
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2554
|
+
|
2555
|
+
#pragma unroll
|
2556
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2557
|
+
int i = i0 + i_offset;
|
2558
|
+
|
2559
|
+
if (need_check) {
|
2560
|
+
i = min(i, i_max);
|
2561
|
+
}
|
2562
|
+
|
2563
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2564
|
+
|
2565
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2566
|
+
}
|
2567
|
+
|
2568
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2569
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2570
|
+
|
2571
|
+
#pragma unroll
|
2572
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
|
2573
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2574
|
+
|
2575
|
+
if (need_check) {
|
2576
|
+
i = min(i, i_max);
|
2577
|
+
}
|
2578
|
+
|
2579
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2580
|
+
|
2581
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2582
|
+
}
|
2583
|
+
|
2584
|
+
#pragma unroll
|
2585
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2586
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2587
|
+
|
2588
|
+
if (need_check) {
|
2589
|
+
i = min(i, i_max);
|
2590
|
+
}
|
2591
|
+
|
2592
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2593
|
+
|
2594
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
|
2595
|
+
}
|
2596
|
+
}
|
2597
|
+
|
2598
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2599
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2600
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2601
|
+
|
2602
|
+
__builtin_assume(i >= 0);
|
2603
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2604
|
+
__builtin_assume(j >= 0);
|
2605
|
+
__builtin_assume(j < WARP_SIZE);
|
2606
|
+
__builtin_assume(k >= 0);
|
2607
|
+
__builtin_assume(k < WARP_SIZE);
|
2608
|
+
|
2609
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2610
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2611
|
+
|
2612
|
+
int v[2];
|
2613
|
+
int u[2*QR4_K];
|
2614
|
+
float d8[QR4_K];
|
2615
|
+
|
2616
|
+
// kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
|
2617
|
+
const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
|
2618
|
+
|
2619
|
+
v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2620
|
+
v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2621
|
+
|
2622
|
+
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2623
|
+
uint16_t aux[2];
|
2624
|
+
const int l = bq8_offset/2;
|
2625
|
+
if (l < 2) {
|
2626
|
+
aux[0] = scales[l+0] & 0x3f3f;
|
2627
|
+
aux[1] = scales[l+2] & 0x3f3f;
|
2628
|
+
} else {
|
2629
|
+
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2630
|
+
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2631
|
+
}
|
2632
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2633
|
+
const uint8_t * m = sc + 2;
|
2634
|
+
|
2635
|
+
for (int l = 0; l < QR4_K; ++l) {
|
2636
|
+
const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2637
|
+
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2638
|
+
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2639
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
2640
|
+
}
|
2641
|
+
|
2642
|
+
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
|
2643
|
+
}
|
2644
|
+
|
2645
|
+
#define VDR_q5_K_q8_1 2
|
2646
|
+
|
2647
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
|
2648
|
+
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2649
|
+
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
2650
|
+
|
2651
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2652
|
+
float sumf_d = 0.0f;
|
2653
|
+
float sumf_m = 0.0f;
|
2654
|
+
|
2655
|
+
for (int i = 0; i < QR5_K; ++i) {
|
2656
|
+
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
2657
|
+
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
2658
|
+
|
2659
|
+
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
2660
|
+
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
2661
|
+
|
2662
|
+
const int v0i = vl0i | vh0i;
|
2663
|
+
const int v1i = vl1i | vh1i;
|
2664
|
+
|
2665
|
+
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
2666
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
2667
|
+
|
2668
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
2669
|
+
sumf_m += d8[i] * (dot2 * m[i]);
|
2670
|
+
|
2671
|
+
}
|
2672
|
+
|
2673
|
+
return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
|
2674
|
+
|
2675
|
+
#else
|
2676
|
+
return 0.0f; // only to satisfy the compiler
|
2677
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2678
|
+
}
|
2679
|
+
|
2680
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
2681
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2682
|
+
|
2683
|
+
#ifndef GGML_QKK_64
|
2684
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2685
|
+
|
2686
|
+
int vl[2];
|
2687
|
+
int vh[2];
|
2688
|
+
int u[2*QR5_K];
|
2689
|
+
float d8[QR5_K];
|
2690
|
+
|
2691
|
+
const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
|
2692
|
+
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2693
|
+
const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
|
2694
|
+
|
2695
|
+
vl[0] = ql[0];
|
2696
|
+
vl[1] = ql[4];
|
2697
|
+
|
2698
|
+
vh[0] = qh[0] >> bq8_offset;
|
2699
|
+
vh[1] = qh[4] >> bq8_offset;
|
2700
|
+
|
2701
|
+
const uint16_t * scales = (const uint16_t *)bq5_K->scales;
|
2702
|
+
uint16_t aux[2];
|
2703
|
+
const int j = bq8_offset/2;
|
2704
|
+
if (j < 2) {
|
2705
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2706
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2707
|
+
} else {
|
2708
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2709
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2710
|
+
}
|
2711
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2712
|
+
const uint8_t * m = sc + 2;
|
2713
|
+
|
2714
|
+
for (int i = 0; i < QR5_K; ++i) {
|
2715
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2716
|
+
d8[i] = bq8i->ds.x;
|
2717
|
+
|
2718
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2719
|
+
u[2*i+0] = q8[0];
|
2720
|
+
u[2*i+1] = q8[4];
|
2721
|
+
}
|
2722
|
+
|
2723
|
+
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
|
2724
|
+
|
2725
|
+
#else
|
2726
|
+
|
2727
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2728
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2729
|
+
|
2730
|
+
const int8_t * s = bq5_K->scales;
|
2731
|
+
|
2732
|
+
const float d = bq5_K->d;
|
2733
|
+
|
2734
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2735
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2736
|
+
|
2737
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2738
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2739
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2740
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2741
|
+
|
2742
|
+
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
2743
|
+
const int vl1 = ql[0];
|
2744
|
+
const int vl2 = ql[4];
|
2745
|
+
|
2746
|
+
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
2747
|
+
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
2748
|
+
const int in = step%8; // 0, 4, 0, 4
|
2749
|
+
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
2750
|
+
|
2751
|
+
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
2752
|
+
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
2753
|
+
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
2754
|
+
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
2755
|
+
|
2756
|
+
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
2757
|
+
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
2758
|
+
|
2759
|
+
return d * sumf_d;
|
2760
|
+
|
2761
|
+
#else
|
2762
|
+
return 0.0f; // only to satisfy the compiler
|
2763
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2764
|
+
|
2765
|
+
#endif
|
2766
|
+
}
|
2767
|
+
|
2768
|
+
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2769
|
+
|
2770
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2771
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
|
2772
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
|
2773
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2774
|
+
|
2775
|
+
*x_ql = tile_x_ql;
|
2776
|
+
*x_dm = tile_x_dm;
|
2777
|
+
*x_qh = tile_x_qh;
|
2778
|
+
*x_sc = tile_x_sc;
|
2779
|
+
}
|
2780
|
+
|
2781
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2782
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2783
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2784
|
+
|
2785
|
+
__builtin_assume(i_offset >= 0);
|
2786
|
+
__builtin_assume(i_offset < 8);
|
2787
|
+
__builtin_assume(k >= 0);
|
2788
|
+
__builtin_assume(k < WARP_SIZE);
|
2789
|
+
|
2790
|
+
const int kbx = k / QI5_K; // == 0 if QK_K == 256
|
2791
|
+
const int kqsx = k % QI5_K; // == k if QK_K == 256
|
2792
|
+
|
2793
|
+
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2794
|
+
|
2795
|
+
#pragma unroll
|
2796
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2797
|
+
int i = i0 + i_offset;
|
2798
|
+
|
2799
|
+
if (need_check) {
|
2800
|
+
i = min(i, i_max);
|
2801
|
+
}
|
2802
|
+
|
2803
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2804
|
+
|
2805
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2806
|
+
}
|
2807
|
+
|
2808
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2809
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2810
|
+
|
2811
|
+
#pragma unroll
|
2812
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
|
2813
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2814
|
+
|
2815
|
+
if (need_check) {
|
2816
|
+
i = min(i, i_max);
|
2817
|
+
}
|
2818
|
+
|
2819
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2820
|
+
|
2821
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
2822
|
+
}
|
2823
|
+
|
2824
|
+
#pragma unroll
|
2825
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
|
2826
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2827
|
+
|
2828
|
+
if (need_check) {
|
2829
|
+
i = min(i, i_max);
|
2830
|
+
}
|
2831
|
+
|
2832
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
|
2833
|
+
|
2834
|
+
x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
|
2835
|
+
}
|
2836
|
+
|
2837
|
+
#pragma unroll
|
2838
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2839
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2840
|
+
|
2841
|
+
if (need_check) {
|
2842
|
+
i = min(i, i_max);
|
2843
|
+
}
|
2844
|
+
|
2845
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2846
|
+
|
2847
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
|
2848
|
+
}
|
2849
|
+
}
|
2850
|
+
|
2851
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
2852
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2853
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2854
|
+
|
2855
|
+
__builtin_assume(i >= 0);
|
2856
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2857
|
+
__builtin_assume(j >= 0);
|
2858
|
+
__builtin_assume(j < WARP_SIZE);
|
2859
|
+
__builtin_assume(k >= 0);
|
2860
|
+
__builtin_assume(k < WARP_SIZE);
|
2861
|
+
|
2862
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2863
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2864
|
+
|
2865
|
+
int vl[2];
|
2866
|
+
int vh[2];
|
2867
|
+
int u[2*QR4_K];
|
2868
|
+
float d8[QR4_K];
|
2869
|
+
|
2870
|
+
const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
|
2871
|
+
|
2872
|
+
vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2873
|
+
vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2874
|
+
|
2875
|
+
vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
|
2876
|
+
vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
|
2877
|
+
|
2878
|
+
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2879
|
+
uint16_t aux[2];
|
2880
|
+
const int l = bq8_offset/2;
|
2881
|
+
if (l < 2) {
|
2882
|
+
aux[0] = scales[l+0] & 0x3f3f;
|
2883
|
+
aux[1] = scales[l+2] & 0x3f3f;
|
2884
|
+
} else {
|
2885
|
+
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2886
|
+
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2887
|
+
}
|
2888
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2889
|
+
const uint8_t * m = sc + 2;
|
2890
|
+
|
2891
|
+
for (int l = 0; l < QR5_K; ++l) {
|
2892
|
+
const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2893
|
+
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2894
|
+
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2895
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
2896
|
+
}
|
2897
|
+
|
2898
|
+
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
|
2899
|
+
}
|
2900
|
+
|
2901
|
+
#define VDR_q6_K_q8_1 1
|
2902
|
+
|
2903
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
|
2904
|
+
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
2905
|
+
const float & d, const float * __restrict__ d8) {
|
2906
|
+
|
2907
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2908
|
+
float sumf = 0.0f;
|
2909
|
+
|
2910
|
+
for (int i = 0; i < QR6_K; ++i) {
|
2911
|
+
const int sc = scales[4*i];
|
2912
|
+
|
2913
|
+
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
2914
|
+
|
2915
|
+
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
2916
|
+
|
2917
|
+
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
2918
|
+
|
2919
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2920
|
+
}
|
2921
|
+
|
2922
|
+
return d*sumf;
|
2923
|
+
#else
|
2924
|
+
return 0.0f; // only to satisfy the compiler
|
2925
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2926
|
+
}
|
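
For reference: vec_dot_q6_K_q8_1_impl rebuilds each 6-bit quant from 4 low bits (ql) and 2 high bits (qh) and recentres it by 32, which is exactly what __vsubss4((vil | vih), 0x20202020) does on four lanes at once. A one-lane scalar sketch (illustration only):

    #include <cstdio>

    int main() {
        const int low4   = 0x9;                  // bits 0-3, from ql
        const int high2  = 0x2;                  // bits 4-5, from qh
        const int stored = low4 | (high2 << 4);  // 0..63
        const int value  = stored - 32;          // -32..31, the signed q6_K value
        std::printf("stored = %d, value = %d\n", stored, value); // 41, 9
        return 0;
    }
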
2927
|
+
|
2928
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
2929
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2930
|
+
|
2931
|
+
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
2932
|
+
|
2933
|
+
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
|
2934
|
+
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
2935
|
+
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
2936
|
+
|
2937
|
+
const int vl = get_int_from_uint8(bq6_K->ql, iqs);
|
2938
|
+
const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
|
2939
|
+
|
2940
|
+
const int8_t * scales = bq6_K->scales + scale_offset;
|
2941
|
+
|
2942
|
+
int u[QR6_K];
|
2943
|
+
float d8[QR6_K];
|
2944
|
+
|
2945
|
+
for (int i = 0; i < QR6_K; ++i) {
|
2946
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
2947
|
+
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
2948
|
+
}
|
1448
2949
|
|
1449
|
-
|
1450
|
-
|
2950
|
+
return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
|
2951
|
+
}
|
1451
2952
|
|
1452
|
-
|
1453
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
2953
|
+
static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1454
2954
|
|
1455
|
-
|
1456
|
-
|
1457
|
-
|
2955
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2956
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
|
2957
|
+
__shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
|
2958
|
+
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
1458
2959
|
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1462
|
-
|
2960
|
+
*x_ql = tile_x_ql;
|
2961
|
+
*x_dm = tile_x_dm;
|
2962
|
+
*x_qh = tile_x_qh;
|
2963
|
+
*x_sc = tile_x_sc;
|
1463
2964
|
}
|
1464
2965
|
|
1465
|
-
static __device__ __forceinline__
|
1466
|
-
const void * __restrict__
|
2966
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
2967
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2968
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1467
2969
|
|
1468
|
-
|
1469
|
-
|
2970
|
+
__builtin_assume(i_offset >= 0);
|
2971
|
+
__builtin_assume(i_offset < 8);
|
2972
|
+
__builtin_assume(k >= 0);
|
2973
|
+
__builtin_assume(k < WARP_SIZE);
|
1470
2974
|
|
1471
|
-
const int
|
1472
|
-
const int
|
2975
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2976
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
1473
2977
|
|
1474
|
-
|
2978
|
+
const block_q6_K * bx0 = (block_q6_K *) vx;
|
1475
2979
|
|
1476
|
-
|
2980
|
+
#pragma unroll
|
2981
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2982
|
+
int i = i0 + i_offset;
|
1477
2983
|
|
1478
|
-
|
1479
|
-
|
2984
|
+
if (need_check) {
|
2985
|
+
i = min(i, i_max);
|
2986
|
+
}
|
1480
2987
|
|
1481
|
-
|
1482
|
-
memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
|
1483
|
-
vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
|
1484
|
-
vh >>= bq8_offset;
|
2988
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
1485
2989
|
|
1486
|
-
|
1487
|
-
|
2990
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
|
2991
|
+
}
|
1488
2992
|
|
1489
|
-
|
1490
|
-
|
1491
|
-
const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
|
2993
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
1492
2995
|
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
2996
|
+
#pragma unroll
|
2997
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
1496
2999
|
|
1497
|
-
|
3000
|
+
if (need_check) {
|
3001
|
+
i = min(i, i_max);
|
3002
|
+
}
|
1498
3003
|
|
1499
|
-
const
|
1500
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1501
|
-
const float d8i = bq8i->d;
|
3004
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
1502
3005
|
|
1503
|
-
|
3006
|
+
x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
|
3007
|
+
}
|
1504
3008
|
|
1505
|
-
|
3009
|
+
#pragma unroll
|
3010
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
|
3011
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
1506
3012
|
|
1507
|
-
|
3013
|
+
if (need_check) {
|
3014
|
+
i = min(i, i_max);
|
3015
|
+
}
|
3016
|
+
|
3017
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
|
1508
3018
|
|
1509
|
-
|
3019
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
|
1510
3020
|
}
|
1511
3021
|
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1516
|
-
}
|
3022
|
+
#pragma unroll
|
3023
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
3024
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
1517
3025
|
|
1518
|
-
|
1519
|
-
|
3026
|
+
if (need_check) {
|
3027
|
+
i = min(i, i_max);
|
3028
|
+
}
|
1520
3029
|
|
1521
|
-
|
1522
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
3030
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
|
1523
3031
|
|
1524
|
-
|
3032
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
|
3033
|
+
}
|
3034
|
+
}
|
1525
3035
|
|
1526
|
-
|
1527
|
-
|
3036
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
3037
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3038
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1528
3039
|
|
1529
|
-
|
1530
|
-
|
3040
|
+
__builtin_assume(i >= 0);
|
3041
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
3042
|
+
__builtin_assume(j >= 0);
|
3043
|
+
__builtin_assume(j < WARP_SIZE);
|
3044
|
+
__builtin_assume(k >= 0);
|
3045
|
+
__builtin_assume(k < WARP_SIZE);
|
1531
3046
|
|
1532
|
-
const int
|
3047
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3048
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
1533
3049
|
|
1534
|
-
|
1535
|
-
|
3050
|
+
const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
|
3051
|
+
const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
|
3052
|
+
const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
|
1536
3053
|
|
1537
|
-
|
1538
|
-
get_scale_min_k4(isc, bq4_K->scales, sc, m);
|
3054
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
|
1539
3055
|
|
1540
|
-
|
1541
|
-
|
1542
|
-
const float d8i = bq8i->d;
|
3056
|
+
const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
|
3057
|
+
const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
|
1543
3058
|
|
1544
|
-
|
3059
|
+
int u[QR6_K];
|
3060
|
+
float d8[QR6_K];
|
1545
3061
|
|
1546
|
-
|
1547
|
-
|
3062
|
+
for (int l = 0; l < QR6_K; ++l) {
|
3063
|
+
const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
|
3064
|
+
u[l] = y_qs[kqsy];
|
3065
|
+
d8[l] = y_ds[kqsy / QI8_1].x;
|
1548
3066
|
}
|
1549
3067
|
|
1550
|
-
return
|
1551
|
-
|
1552
|
-
return 0.0f; // only to satisfy the compiler
|
1553
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
3068
|
+
return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
|
3069
|
+
x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
|
1554
3070
|
}
|
1555
3071
|
|
1556
|
-
|
1557
|
-
|
1558
|
-
|
1559
|
-
|
1560
|
-
const
|
3072
|
+
template <int qk, int qr, int qi, typename block_q_t,
|
3073
|
+
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3074
|
+
static __global__ void mul_mat_q(
|
3075
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3076
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
1561
3077
|
|
1562
|
-
const
|
3078
|
+
const block_q_t * x = (const block_q_t *) vx;
|
3079
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
1563
3080
|
|
1564
|
-
|
1565
|
-
|
3081
|
+
const int blocks_per_row_x = ncols_x / qk;
|
3082
|
+
const int blocks_per_col_y = nrows_y / QK8_1;
|
3083
|
+
const int blocks_per_warp = WARP_SIZE / qi;
|
1566
3084
|
|
1567
|
-
const
|
1568
|
-
const float dmin = bq5_K->dmin;
|
3085
|
+
const int & ncols_dst = ncols_y;
|
1569
3086
|
|
1570
|
-
const int
|
3087
|
+
const int tid_x = threadIdx.x;
|
3088
|
+
const int tid_y = threadIdx.y;
|
1571
3089
|
|
1572
|
-
const int
|
3090
|
+
const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
|
3091
|
+
const int & row_x_0 = row_dst_0;
|
3092
|
+
const int row_dst = row_dst_0 + tid_x;
|
1573
3093
|
|
1574
|
-
|
1575
|
-
|
3094
|
+
const int col_dst_0 = blockIdx.y*WARP_SIZE;
|
3095
|
+
const int & col_y_0 = col_dst_0;
|
1576
3096
|
|
1577
|
-
|
1578
|
-
|
3097
|
+
int * tile_x_ql = nullptr;
|
3098
|
+
half2 * tile_x_dm = nullptr;
|
3099
|
+
int * tile_x_qh = nullptr;
|
3100
|
+
int * tile_x_sc = nullptr;
|
1579
3101
|
|
1580
|
-
|
1581
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1582
|
-
const float d8i = bq8i->d;
|
3102
|
+
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
1583
3103
|
|
1584
|
-
|
3104
|
+
const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
|
1585
3105
|
|
1586
|
-
|
3106
|
+
__shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
|
3107
|
+
__shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
|
1587
3108
|
|
1588
|
-
|
3109
|
+
float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
|
1589
3110
|
|
1590
|
-
|
1591
|
-
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
|
1592
|
-
}
|
3111
|
+
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
1593
3112
|
|
1594
|
-
|
1595
|
-
|
1596
|
-
return 0.0f; // only to satisfy the compiler
|
1597
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1598
|
-
}
|
3113
|
+
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3114
|
+
tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
|
1599
3115
|
|
1600
|
-
|
1601
|
-
|
3116
|
+
for (int ir = 0; ir < qr; ++ir) {
|
3117
|
+
const int kqs = ir*WARP_SIZE + tid_x;
|
3118
|
+
const int kbxd = kqs / QI8_1;
|
1602
3119
|
|
1603
|
-
|
1604
|
-
|
3120
|
+
for (int i = 0; i < WARP_SIZE; i += 8) {
|
3121
|
+
const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
1605
3122
|
|
1606
|
-
|
1607
|
-
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
1608
|
-
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
3123
|
+
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
1609
3124
|
|
1610
|
-
|
3125
|
+
tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
|
3126
|
+
}
|
3127
|
+
}
|
1611
3128
|
|
1612
|
-
|
3129
|
+
for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
|
3130
|
+
const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
|
3131
|
+
const int kby = tid_x % blocks_per_tile_y_col;
|
3132
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3133
|
+
tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
|
3134
|
+
}
|
1613
3135
|
|
1614
|
-
|
1615
|
-
memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
|
3136
|
+
__syncthreads();
|
1616
3137
|
|
1617
|
-
|
1618
|
-
|
3138
|
+
#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
|
3139
|
+
#pragma unroll
|
3140
|
+
#endif // __CUDA_ARCH__ >= 700
|
3141
|
+
for (int k = 0; k < WARP_SIZE; k += vdr) {
|
3142
|
+
#pragma unroll
|
3143
|
+
for (int j = 0; j < WARP_SIZE; j += 8) {
|
3144
|
+
#pragma unroll
|
3145
|
+
for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
|
3146
|
+
sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3147
|
+
tid_x + i, tid_y + j, k);
|
3148
|
+
}
|
3149
|
+
}
|
3150
|
+
}
|
1619
3151
|
|
1620
|
-
|
1621
|
-
|
3152
|
+
__syncthreads();
|
3153
|
+
}
|
1622
3154
|
|
1623
|
-
const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
|
1624
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
|
1625
|
-
const float d8i = bq8i->d;
|
1626
3155
|
|
1627
|
-
|
3156
|
+
if (row_dst >= nrows_dst) {
|
3157
|
+
return;
|
3158
|
+
}
|
1628
3159
|
|
1629
|
-
|
3160
|
+
for (int j = 0; j < WARP_SIZE; j += 8) {
|
3161
|
+
const int col_dst = col_dst_0 + j + tid_y;
|
1630
3162
|
|
1631
|
-
|
3163
|
+
if (col_dst >= ncols_dst) {
|
3164
|
+
return;
|
3165
|
+
}
|
1632
3166
|
|
1633
|
-
|
3167
|
+
for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
|
3168
|
+
dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
|
3169
|
+
}
|
1634
3170
|
}
|
1635
|
-
|
1636
|
-
return d*sumf;
|
1637
|
-
#else
|
1638
|
-
return 0.0f; // only to satisfy the compiler
|
1639
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1640
3171
|
}
|
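
Each mul_mat_q block computes a GGML_CUDA_MMQ_Y x WARP_SIZE tile of dst (row_dst_0 = blockIdx.x * GGML_CUDA_MMQ_Y, col_dst_0 = blockIdx.y * WARP_SIZE) with an 8-row thread block in y, matching the i_offset < 8 assumption in the load_tiles_* helpers. A hedged sketch of the grid arithmetic a launcher would need; the launch helpers themselves are outside this hunk and the MMQ_Y value below is only illustrative:

    #include <cstdio>

    int main() {
        const int WARP_SIZE       = 32;
        const int GGML_CUDA_MMQ_Y = 64;   // illustrative; fixed at compile time in the package
        const int nrows_x = 4096;         // rows of the quantized weight matrix
        const int ncols_y = 512;          // columns of the activations

        // One block per GGML_CUDA_MMQ_Y x WARP_SIZE output tile (ceiling division).
        const int grid_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
        const int grid_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
        std::printf("grid = (%d, %d, 1), block = (%d, 8, 1)\n", grid_x, grid_y, WARP_SIZE);
        return 0;
    }
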
1641
3172
|
|
1642
|
-
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3173
|
+
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
1643
3174
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
1644
3175
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
1645
3176
|
|
@@ -1648,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1648
3179
|
}
|
1649
3180
|
|
1650
3181
|
const int blocks_per_row = ncols / qk;
|
1651
|
-
const int blocks_per_warp = WARP_SIZE / qi;
|
3182
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
1652
3183
|
|
1653
3184
|
// partial sum for each thread
|
1654
3185
|
float tmp = 0.0f;
|
@@ -1657,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1657
3188
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
1658
3189
|
|
1659
3190
|
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
1660
|
-
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
|
3191
|
+
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
1661
3192
|
|
1662
|
-
const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
|
3193
|
+
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
|
1663
3194
|
|
1664
|
-
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
|
3195
|
+
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
1665
3196
|
|
1666
3197
|
tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
|
1667
3198
|
}
|
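
The new vdr template parameter is the number of packed ints each thread feeds into one vec_dot_q_cuda call, so a warp now covers vdr * WARP_SIZE / qi blocks per iteration and iqs advances in steps of vdr. A small arithmetic example of the index math, assuming q8_0 with QI8_0 = 8 and a vdr of 2 (the VDR_Q8_0_Q8_1_MMVQ value is defined outside this hunk, so 2 is only an assumption here):

    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int qi  = 8;   // packed ints per quant block (QI8_0)
        const int vdr = 2;   // ints handled per thread and call (assumed)

        std::printf("blocks_per_warp = %d\n", vdr * WARP_SIZE / qi); // 8

        // Same index math as mul_mat_vec_q, for the first few lanes of the warp:
        for (int tid = 0; tid < 8; ++tid) {
            const int block_off = tid / (qi / vdr);         // which block this lane works on
            const int iqs       = vdr * (tid % (qi / vdr)); // first int inside that block
            std::printf("lane %d -> block offset %d, iqs %d\n", tid, block_off, iqs);
        }
        return 0;
    }
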
@@ -1694,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1694
3225
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
1695
3226
|
|
1696
3227
|
// partial sum for each thread
|
1697
|
-
#ifdef
|
3228
|
+
#ifdef GGML_CUDA_F16
|
1698
3229
|
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
1699
3230
|
#else
|
1700
3231
|
float tmp = 0.0f;
|
1701
|
-
#endif //
|
3232
|
+
#endif // GGML_CUDA_F16
|
1702
3233
|
|
1703
3234
|
for (int i = 0; i < ncols; i += iter_stride) {
|
1704
3235
|
const int col = i + vals_per_iter*tid;
|
@@ -1718,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1718
3249
|
|
1719
3250
|
// matrix multiplication
|
1720
3251
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
1721
|
-
#ifdef
|
3252
|
+
#ifdef GGML_CUDA_F16
|
1722
3253
|
tmp += __hmul2(v, {
|
1723
3254
|
y[iybs + iqs + j/qr + 0],
|
1724
3255
|
y[iybs + iqs + j/qr + y_offset]
|
@@ -1726,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1726
3257
|
#else
|
1727
3258
|
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
1728
3259
|
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
1729
|
-
#endif //
|
3260
|
+
#endif // GGML_CUDA_F16
|
1730
3261
|
}
|
1731
3262
|
}
|
1732
3263
|
|
@@ -1737,19 +3268,23 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1737
3268
|
}
|
1738
3269
|
|
1739
3270
|
if (tid == 0) {
|
1740
|
-
#ifdef
|
3271
|
+
#ifdef GGML_CUDA_F16
|
1741
3272
|
dst[row] = tmp.x + tmp.y;
|
1742
3273
|
#else
|
1743
3274
|
dst[row] = tmp;
|
1744
|
-
#endif //
|
3275
|
+
#endif // GGML_CUDA_F16
|
1745
3276
|
}
|
1746
3277
|
}
|
1747
3278
|
|
1748
|
-
static __global__ void mul_mat_p021_f16_f32(
|
3279
|
+
static __global__ void mul_mat_p021_f16_f32(
|
3280
|
+
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
3281
|
+
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
|
3282
|
+
|
1749
3283
|
const half * x = (const half *) vx;
|
1750
3284
|
|
1751
3285
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
1752
3286
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
3287
|
+
const int channel_x = channel / (nchannels_y / nchannels_x);
|
1753
3288
|
|
1754
3289
|
const int nrows_y = ncols_x;
|
1755
3290
|
const int nrows_dst = nrows_x;
|
@@ -1765,7 +3300,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
|
|
1765
3300
|
}
|
1766
3301
|
|
1767
3302
|
// x is transposed and permuted
|
1768
|
-
const int ix = row_x*nchannels_x*ncols_x +
|
3303
|
+
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
|
1769
3304
|
const float xi = __half2float(x[ix]);
|
1770
3305
|
|
1771
3306
|
const int row_y = col_x;
|
@@ -1793,12 +3328,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
|
|
1793
3328
|
|
1794
3329
|
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
1795
3330
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
1796
|
-
const int row_stride_x, const int channel_stride_x) {
|
3331
|
+
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
|
1797
3332
|
|
1798
3333
|
const half * x = (const half *) vx;
|
1799
3334
|
|
1800
3335
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
1801
3336
|
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
3337
|
+
const int channel_x = channel / channel_x_divisor;
|
1802
3338
|
|
1803
3339
|
const int nrows_y = ncols_x;
|
1804
3340
|
const int nrows_dst = nrows_x;
|
@@ -1815,7 +3351,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
1815
3351
|
break;
|
1816
3352
|
}
|
1817
3353
|
|
1818
|
-
const int ix =
|
3354
|
+
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
1819
3355
|
const float xi = __half2float(x[ix]);
|
1820
3356
|
|
1821
3357
|
const int row_y = col_x;
|
@@ -1876,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
1876
3412
|
}
|
1877
3413
|
|
1878
3414
|
// rope == RoPE == rotary positional embedding
|
1879
|
-
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float
|
3415
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3416
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
1880
3417
|
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
1881
3418
|
|
1882
3419
|
if (col >= ncols) {
|
@@ -1886,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
     const int i = row*ncols + col;
 
-    const float theta =
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
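
The rewritten kernel derives the per-row position as p0 + p_delta*(row/p_delta_rows) (integer division, since p_delta_rows rows share one position) and the per-column frequency as theta_scale^(col/2). A small CPU-side C++ sketch that mirrors the computation, with arbitrarily chosen values, for illustration only:

    #include <cmath>
    #include <cstdio>

    int main() {
        // hypothetical values, just to exercise the formula from rope_f32
        const int   ncols        = 8;      // head dimension (must be even)
        const int   p_delta_rows = 4;      // rows per position (ne01 in the caller)
        const float p0           = 5.0f;   // starting position
        const float p_delta      = 1.0f;   // position step
        const float theta_scale  = powf(10000.0f, -2.0f/ncols);

        for (int row = 0; row < 8; ++row) {
            for (int col = 0; col < ncols; col += 2) {
                const float theta = (p0 + p_delta * (row / p_delta_rows)) * powf(theta_scale, col/2);
                printf("row=%d col=%d theta=%.4f\n", row, col, theta);
            }
        }
        return 0;
    }
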
@@ -2027,15 +3564,17 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int
-    const int
-
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
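
The new quantize_row_q8_1_cuda launch computes its x-grid size with the usual round-up division so that kx_padded elements are always covered, and the ky rows become the y-dimension of the grid. A plain C++ sketch of the rounding (the block size here is a stand-in, not the library's constant):

    #include <cstdio>

    // round-up integer division, as used for block_num_x in the launcher above
    static int ceil_div(int n, int block) {
        return (n + block - 1) / block;
    }

    int main() {
        const int block_size = 256;                 // stand-in for CUDA_QUANTIZE_BLOCK_SIZE
        for (int kx_padded : {256, 257, 4096, 4100}) {
            printf("kx_padded=%d -> blocks=%d\n", kx_padded, ceil_div(kx_padded, block_size));
        }
        return 0;
    }
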
@@ -2196,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2205,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2214,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2223,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2232,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2241,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2250,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2259,7 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2268,7 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2277,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
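
Each of these launchers now also passes an extra compile-time integer (the VDR_* constants) into the mul_mat_vec_q template, presumably so that the dot-product width is fixed at compile time and the inner loop can be unrolled by the compiler. A generic C++ sketch of that pattern (the names and values here are illustrative stand-ins, not the library's API):

    #include <cstdio>

    // vdr is a non-type template parameter: the compiler sees a constant
    // trip count and can fully unroll the inner loop.
    template <int vdr>
    float dot_fixed_width(const float * a, const float * b) {
        float sum = 0.0f;
        for (int i = 0; i < vdr; ++i) {   // bounds known at compile time
            sum += a[i]*b[i];
        }
        return sum;
    }

    int main() {
        const float a[4] = {1, 2, 3, 4};
        const float b[4] = {4, 3, 2, 1};
        printf("vdr=2: %.1f\n", dot_fixed_width<2>(a, b));
        printf("vdr=4: %.1f\n", dot_fixed_width<4>(a, b));
        return 0;
    }
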
@@ -2324,20 +3863,203 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
-static void
-    const
+static void ggml_mul_mat_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q8_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q2_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q3_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_p021_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
 
-    const dim3 block_nums(1, nrows_x,
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
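
All of the added ggml_mul_mat_q*_cuda launchers pick between load_tiles_*<false> and load_tiles_*<true> depending on whether nrows_x is a multiple of GGML_CUDA_MMQ_Y; the true variant presumably adds bounds checks for the ragged last tile. The dispatch pattern, reduced to plain C++ with illustrative stand-ins rather than the actual kernels:

    #include <cstdio>
    #include <vector>

    // Stand-in for a tile loader: when need_check is true, reads are clamped
    // to the last valid row instead of assuming a full tile.
    template <bool need_check>
    int load_row(const std::vector<int> & rows, int i) {
        if (need_check && i >= (int) rows.size()) {
            return rows.back();          // clamp, as a bounds-checked variant might
        }
        return rows[i];                  // fast path: caller guarantees i is in range
    }

    int main() {
        const int tile = 4;              // stand-in for GGML_CUDA_MMQ_Y
        std::vector<int> rows = {10, 11, 12, 13, 14, 15};   // 6 rows, not a multiple of 4

        if (rows.size() % tile == 0) {
            printf("full tiles: %d\n", load_row<false>(rows, 5));
        } else {
            printf("ragged last tile: %d\n", load_row<true>(rows, 7)); // out-of-range index gets clamped
        }
        return 0;
    }

Selecting the bool at launch time keeps the common, aligned case free of per-element branches.
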
@@ -2365,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2499,6 +4222,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -2688,6 +4412,7 @@ inline void ggml_cuda_op_mul(
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
+    (void) i1;
 }
 
 inline void ggml_cuda_op_gelu(
@@ -2767,8 +4492,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
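
The epsilon now travels in dst->op_params, an int32 parameter array, and is recovered with memcpy rather than a pointer cast, which avoids strict-aliasing problems. A small sketch of packing a float into such an int32 block and reading it back (the array here is a hypothetical stand-in, not the ggml struct):

    #include <cstdio>
    #include <cstring>
    #include <cstdint>

    int main() {
        int32_t op_params[8] = {0};      // stand-in for ggml's op_params storage

        const float eps_in = 1e-6f;
        memcpy(&op_params[0], &eps_in, sizeof(float));   // producer side: store the raw bits

        float eps_out;
        memcpy(&eps_out, &op_params[0], sizeof(float));  // consumer side, as in ggml_cuda_op_rms_norm

        printf("eps round-trips as %g\n", eps_out);
        return 0;
    }
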
@@ -2778,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddq_i != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t i01_diff = i01_high - i01_low;
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
+        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
+    size_t as;
+    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    ggml_cuda_pool_free(src1_q8_1, as);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
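
ggml_cuda_op_mul_mat_q pads ne10 up to the next multiple of MATRIX_ROW_PADDING before quantizing src1, matching how the weight rows are padded on upload further down in this diff. The round-up expression, checked on the CPU (the padding value of 512 is assumed from the comment near ggml_cuda_transform_tensor):

    #include <cstdio>
    #include <cstdint>

    // Same expression as padded_row_size in ggml_cuda_op_mul_mat_q.
    static int64_t round_up(int64_t n, int64_t padding) {
        return n % padding == 0 ? n : n - n % padding + padding;
    }

    int main() {
        const int64_t padding = 512;     // assumed MATRIX_ROW_PADDING
        for (int64_t ne10 : {512, 513, 4096, 5000}) {
            printf("ne10=%lld -> padded=%lld\n", (long long) ne10, (long long) round_up(ne10, padding));
        }
        return 0;
    }
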
@@ -2792,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -2815,11 +4621,11 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
-        int64_t padded_row_size = ne00
-
+        const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -2860,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         ggml_cuda_pool_free(src1_q8_1, as);
     } else {
         // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef
+#ifdef GGML_CUDA_F16
         size_t ash;
         dfloat * src1_dfloat = nullptr; // dfloat == half
 
@@ -2876,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         }
 #else
         dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
-#endif //
+#endif // GGML_CUDA_F16
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -2917,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
                 break;
         }
 
-#ifdef
+#ifdef GGML_CUDA_F16
         if (src1_convert_f16) {
             ggml_cuda_pool_free(src1_dfloat, ash);
         }
-#endif //
+#endif // GGML_CUDA_F16
     }
 
     (void) src1;
@@ -2981,32 +4787,35 @@ inline void ggml_cuda_op_rope(
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *)
-    const int n_dims = ((int32_t *)
-    const int mode = ((int32_t *)
-    const int n_ctx = ((int32_t *)
-
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
     // RoPE alteration for extended context
+
     float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *)
-    memcpy(&freq_scale, (int32_t *)
+    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
-    bool is_glm = mode & 4;
+    const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
+        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
     } else {
-
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
@@ -3025,11 +4834,12 @@ inline void ggml_cuda_op_diag_mask_inf(
     const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *)
+    const int n_past = ((int32_t *) dst->op_params)[0];
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
@@ -3097,6 +4907,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
     const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
     const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
 
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
@@ -3108,12 +4921,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
 
     // strides for iteration over dims 3 and 2
-    const int64_t
-    const int64_t
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+    const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
    const int64_t src0_stride = ne00 * ne01 * stride_mod;
     const int64_t src1_stride = ne10 * ne11 * stride_mod;
     const int64_t dst_stride = ne0 * ne1 * stride_mod;
 
+    const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+    const int64_t i03_max = flatten_rows ? 1 : ne03;
+    const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+    GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
 
@@ -3130,6 +4950,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 
@@ -3163,10 +4984,17 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_low, row_high;
         if (split) {
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-
+            row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % GGML_CUDA_MMQ_Y;
+            }
         } else {
             row_low = 0;
-            row_high = nrows0;
+            row_high = nrows0*i02_divisor;
         }
         if (row_low == row_high) {
             continue;
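
With the mul_mat_q path, each GPU's row slice is now rounded down to a multiple of GGML_CUDA_MMQ_Y so that tiles never straddle a device boundary; only the last device keeps the remainder. A host-side C++ sketch of that split, with made-up tensor-split fractions and tile size, for illustration only:

    #include <cstdio>

    int main() {
        const int device_count     = 2;
        const long long nrows      = 1000;
        const float tensor_split[] = {0.0f, 0.6f};   // hypothetical per-device start fractions
        const long long mmq_y      = 64;             // stand-in for GGML_CUDA_MMQ_Y

        for (int id = 0; id < device_count; ++id) {
            long long row_low = id == 0 ? 0 : (long long)(nrows*tensor_split[id]);
            row_low -= row_low % mmq_y;              // round down to a tile boundary

            long long row_high;
            if (id == device_count - 1) {
                row_high = nrows;                    // last device takes whatever is left
            } else {
                row_high = (long long)(nrows*tensor_split[id + 1]);
                row_high -= row_high % mmq_y;
            }
            printf("device %d: rows [%lld, %lld)\n", id, row_low, row_high);
        }
        return 0;
    }
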
@@ -3214,16 +5042,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
         }
 
-        const int64_t i03_max = flatten_rows ? 1 : ne03;
-        const int64_t i02_max = flatten_rows ? 1 : ne02;
-        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
         for (int64_t i03 = 0; i03 < i03_max; i03++) {
             const int64_t i13 = i03 % ne13;
             for (int64_t i02 = 0; i02 < i02_max; i02++) {
                 const int64_t i12 = i02 % ne12;
 
-                const int64_t i0 = i03*
+                const int64_t i0 = i03*i02_max + i02;
 
                 // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
                 const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +5081,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 const int64_t i11 = i13*ne12 + i12;
 
                 // for split tensors the data begins at i0 == i0_offset_low
-                char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
-                float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+                char  * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
                 float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i  = dst_ddf[id]  + (i0
+                float * dst_ddf_i  = dst_ddf[id]  + (i0 - i0_offset_low)*dst_stride;
 
                 // for split tensors the data pointer needs to be rounded down
                 // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +5123,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     }
                 }
 
-                if (!src0_on_device || !src0_is_contiguous) {
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                     if (src0_is_f32) {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                     } else {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                     }
                 }
 
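
The i02_divisor introduced above lets a src0 with ne02 channels be reused for a src1 with ne12 = k*ne02 channels: src0 data is copied only once every k iterations and indexed with i02/i02_divisor. A tiny C++ sketch of that iteration pattern, with illustrative sizes:

    #include <cstdio>

    int main() {
        const long long ne02 = 2;                    // channels in src0
        const long long ne12 = 6;                    // channels in src1 (broadcast target)
        const long long i02_divisor = ne12 / ne02;   // 3: each src0 channel serves 3 src1 channels

        for (long long i02 = 0; i02 < ne12; ++i02) {
            if (i02 % i02_divisor == 0) {
                printf("i02=%lld: (re)load src0 channel %lld\n", i02, i02/i02_divisor);
            } else {
                printf("i02=%lld: reuse  src0 channel %lld\n", i02, i02/i02_divisor);
            }
        }
        return 0;
    }
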
@@ -3333,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (split) {
                     // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                     // dst is NOT transposed.
-                    // The outputs of
+                    // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                     // Instead they need to be copied to the correct slice in ne0 = dst row index.
                     // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-
-
-
-                    }
+                    float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                    CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                 i01_diff*sizeof(float), ne1, kind, cudaStream_main));
                 } else {
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                     CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
@@ -3457,6 +5280,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
@@ -3469,7 +5294,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +5308,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
@@ -3501,7 +5328,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int row_stride_x = nb01 / sizeof(half);
     const int channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3518,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
-
+            int min_compute_capability = INT_MAX;
+            for (int id = 0; id < g_device_count; ++id) {
+                if (min_compute_capability > g_compute_capabilities[id]) {
+                    min_compute_capability = g_compute_capabilities[id];
+                }
+            }
+
+            if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+            } else {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+            }
         }
     } else {
         GGML_ASSERT(false);
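
ggml_cuda_mul_mat now checks the lowest compute capability across all devices and only takes the quantized mul_mat_q path when every GPU supports the required instructions (MIN_CC_DP4A) and g_mul_mat_q is enabled; otherwise it falls back to the cuBLAS path. The selection logic, reduced to a plain C++ sketch with made-up capability values:

    #include <climits>
    #include <cstdio>

    int main() {
        const int min_cc_dp4a = 610;                 // assumed threshold, cf. MIN_CC_DP4A
        const bool mul_mat_q_enabled = true;         // cf. ggml_cuda_set_mul_mat_q below
        const bool src0_is_quantized = true;
        const int capabilities[] = {860, 610};       // hypothetical per-device compute capabilities

        int min_compute_capability = INT_MAX;
        for (int cc : capabilities) {
            if (min_compute_capability > cc) {
                min_compute_capability = cc;
            }
        }

        if (mul_mat_q_enabled && src0_is_quantized && min_compute_capability >= min_cc_dp4a) {
            printf("use the custom mul_mat_q kernels\n");
        } else {
            printf("fall back to the cuBLAS path\n");
        }
        return 0;
    }
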
@@ -3595,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-
+
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const bool is_glm = mode & 4;
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3628,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-
+        row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+        if (id == g_device_count - 1) {
+            row_high = nrows;
+        } else {
+            row_high = nrows*g_tensor_split[id + 1];
+            row_high -= row_high % GGML_CUDA_MMQ_Y;
+        }
     } else {
         GGML_ASSERT(false);
     }
@@ -3642,7 +5490,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     size_t size = ggml_nbytes_split(tensor, nrows_split);
     const size_t original_size = size;
 
-    // pad last row to a multiple of
+    // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
     if (ne0 % MATRIX_ROW_PADDING != 0) {
         size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
             * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +5506,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     }
 
 
-    CUDA_CHECK(cudaMemcpy(buf, buf_host,
+    CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
     extra->data_device[id] = buf;
 
@@ -3738,7 +5586,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
         }
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3802,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }
 
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
@@ -3840,18 +5692,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
-        case
-
-
-
-
-
-
-
-
-
-
-
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
         case GGML_OP_NORM:
             if (!any_on_device) {
                 return false;