llama_cpp 0.3.5 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
@@ -52,13 +52,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
52
52
|
} while (0)
|
53
53
|
#endif // CUDART_VERSION >= 11
|
54
54
|
|
55
|
-
#ifdef
|
55
|
+
#ifdef GGML_CUDA_F16
|
56
56
|
typedef half dfloat; // dequantize float
|
57
57
|
typedef half2 dfloat2;
|
58
58
|
#else
|
59
59
|
typedef float dfloat; // dequantize float
|
60
60
|
typedef float2 dfloat2;
|
61
|
-
#endif //
|
61
|
+
#endif //GGML_CUDA_F16
|
62
|
+
|
63
|
+
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
|
64
|
+
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
65
|
+
|
66
|
+
int x32 = 0;
|
67
|
+
x32 |= x16[0] << 0;
|
68
|
+
x32 |= x16[1] << 16;
|
69
|
+
|
70
|
+
return x32;
|
71
|
+
}
|
72
|
+
|
73
|
+
static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
|
74
|
+
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
75
|
+
|
76
|
+
int x32 = 0;
|
77
|
+
x32 |= x16[0] << 0;
|
78
|
+
x32 |= x16[1] << 16;
|
79
|
+
|
80
|
+
return x32;
|
81
|
+
}
|
82
|
+
|
83
|
+
static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
|
84
|
+
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
85
|
+
}
|
86
|
+
|
87
|
+
static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
|
88
|
+
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
89
|
+
}
|
62
90
|
|
63
91
|
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
64
92
|
typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
|
@@ -87,8 +115,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
|
|
87
115
|
#define QR4_1 2
|
88
116
|
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
89
117
|
typedef struct {
|
90
|
-
|
91
|
-
half m; // min
|
118
|
+
half2 dm; // dm.x = delta, dm.y = min
|
92
119
|
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
93
120
|
} block_q4_1;
|
94
121
|
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
@@ -107,8 +134,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
|
|
107
134
|
#define QR5_1 2
|
108
135
|
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
109
136
|
typedef struct {
|
110
|
-
|
111
|
-
half m; // min
|
137
|
+
half2 dm; // dm.x = delta, dm.y = min
|
112
138
|
uint8_t qh[4]; // 5-th bit of quants
|
113
139
|
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
114
140
|
} block_q5_1;
|
@@ -127,13 +153,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
|
|
127
153
|
#define QR8_1 1
|
128
154
|
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
129
155
|
typedef struct {
|
130
|
-
|
131
|
-
half s; // unquantized sum
|
156
|
+
half2 ds; // ds.x = delta, ds.y = sum
|
132
157
|
int8_t qs[QK8_0]; // quants
|
133
158
|
} block_q8_1;
|
134
159
|
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
|
135
160
|
|
136
|
-
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
|
161
|
+
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
162
|
+
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
|
163
|
+
typedef void (*load_tiles_cuda_t)(
|
164
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
165
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
|
166
|
+
typedef float (*vec_dot_q_mul_mat_cuda_t)(
|
167
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
168
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
|
137
169
|
|
138
170
|
//================================= k-quants
|
139
171
|
|
@@ -150,8 +182,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
|
|
150
182
|
typedef struct {
|
151
183
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
152
184
|
uint8_t qs[QK_K/4]; // quants
|
153
|
-
|
154
|
-
half dmin; // super-block scale for quantized mins
|
185
|
+
half2 dm; // super-block scale for quantized scales/mins
|
155
186
|
} block_q2_K;
|
156
187
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
157
188
|
|
@@ -180,8 +211,7 @@ typedef struct {
|
|
180
211
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
181
212
|
#else
|
182
213
|
typedef struct {
|
183
|
-
|
184
|
-
half dmin; // super-block scale for quantized mins
|
214
|
+
half2 dm; // super-block scale for quantized scales/mins
|
185
215
|
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
186
216
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
187
217
|
} block_q4_K;
|
@@ -200,11 +230,10 @@ typedef struct {
|
|
200
230
|
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
201
231
|
#else
|
202
232
|
typedef struct {
|
203
|
-
|
204
|
-
|
205
|
-
uint8_t
|
206
|
-
uint8_t
|
207
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
233
|
+
half2 dm; // super-block scale for quantized scales/mins
|
234
|
+
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
235
|
+
uint8_t qh[QK_K/8]; // quants, high bit
|
236
|
+
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
208
237
|
} block_q5_K;
|
209
238
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
210
239
|
#endif
|
@@ -233,6 +262,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
233
262
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
234
263
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
235
264
|
|
265
|
+
#ifndef GGML_CUDA_MMQ_Y
|
266
|
+
#define GGML_CUDA_MMQ_Y 64
|
267
|
+
#endif // GGML_CUDA_MMQ_Y
|
268
|
+
|
236
269
|
// dmmv = dequantize_mul_mat_vec
|
237
270
|
#ifndef GGML_CUDA_DMMV_X
|
238
271
|
#define GGML_CUDA_DMMV_X 32
|
@@ -367,33 +400,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
|
367
400
|
v.x = vui & 0xF;
|
368
401
|
v.y = vui >> 4;
|
369
402
|
|
370
|
-
#ifdef
|
403
|
+
#ifdef GGML_CUDA_F16
|
371
404
|
v = __hsub2(v, {8.0f, 8.0f});
|
372
405
|
v = __hmul2(v, {d, d});
|
373
406
|
#else
|
374
407
|
v.x = (v.x - 8.0f) * d;
|
375
408
|
v.y = (v.y - 8.0f) * d;
|
376
|
-
#endif //
|
409
|
+
#endif // GGML_CUDA_F16
|
377
410
|
}
|
378
411
|
|
379
412
|
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
380
413
|
const block_q4_1 * x = (const block_q4_1 *) vx;
|
381
414
|
|
382
|
-
const dfloat d = x[ib].
|
383
|
-
const dfloat m = x[ib].
|
415
|
+
const dfloat d = x[ib].dm.x;
|
416
|
+
const dfloat m = x[ib].dm.y;
|
384
417
|
|
385
418
|
const int vui = x[ib].qs[iqs];
|
386
419
|
|
387
420
|
v.x = vui & 0xF;
|
388
421
|
v.y = vui >> 4;
|
389
422
|
|
390
|
-
#ifdef
|
423
|
+
#ifdef GGML_CUDA_F16
|
391
424
|
v = __hmul2(v, {d, d});
|
392
425
|
v = __hadd2(v, {m, m});
|
393
426
|
#else
|
394
427
|
v.x = (v.x * d) + m;
|
395
428
|
v.y = (v.y * d) + m;
|
396
|
-
#endif //
|
429
|
+
#endif // GGML_CUDA_F16
|
397
430
|
}
|
398
431
|
|
399
432
|
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
@@ -410,20 +443,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
|
410
443
|
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
411
444
|
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
412
445
|
|
413
|
-
#ifdef
|
446
|
+
#ifdef GGML_CUDA_F16
|
414
447
|
v = __hsub2(v, {16.0f, 16.0f});
|
415
448
|
v = __hmul2(v, {d, d});
|
416
449
|
#else
|
417
450
|
v.x = (v.x - 16.0f) * d;
|
418
451
|
v.y = (v.y - 16.0f) * d;
|
419
|
-
#endif //
|
452
|
+
#endif // GGML_CUDA_F16
|
420
453
|
}
|
421
454
|
|
422
455
|
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
423
456
|
const block_q5_1 * x = (const block_q5_1 *) vx;
|
424
457
|
|
425
|
-
const dfloat d = x[ib].
|
426
|
-
const dfloat m = x[ib].
|
458
|
+
const dfloat d = x[ib].dm.x;
|
459
|
+
const dfloat m = x[ib].dm.y;
|
427
460
|
|
428
461
|
uint32_t qh;
|
429
462
|
memcpy(&qh, x[ib].qh, sizeof(qh));
|
@@ -434,13 +467,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
|
|
434
467
|
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
435
468
|
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
436
469
|
|
437
|
-
#ifdef
|
470
|
+
#ifdef GGML_CUDA_F16
|
438
471
|
v = __hmul2(v, {d, d});
|
439
472
|
v = __hadd2(v, {m, m});
|
440
473
|
#else
|
441
474
|
v.x = (v.x * d) + m;
|
442
475
|
v.y = (v.y * d) + m;
|
443
|
-
#endif //
|
476
|
+
#endif // GGML_CUDA_F16
|
444
477
|
}
|
445
478
|
|
446
479
|
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
@@ -451,12 +484,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
|
451
484
|
v.x = x[ib].qs[iqs + 0];
|
452
485
|
v.y = x[ib].qs[iqs + 1];
|
453
486
|
|
454
|
-
#ifdef
|
487
|
+
#ifdef GGML_CUDA_F16
|
455
488
|
v = __hmul2(v, {d, d});
|
456
489
|
#else
|
457
490
|
v.x *= d;
|
458
491
|
v.y *= d;
|
459
|
-
#endif //
|
492
|
+
#endif // GGML_CUDA_F16
|
460
493
|
}
|
461
494
|
|
462
495
|
//================================== k-quants
|
@@ -475,8 +508,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|
475
508
|
const uint8_t q = x[i].qs[32*n + l];
|
476
509
|
float * y = yy + i*QK_K + 128*n;
|
477
510
|
|
478
|
-
float dall = x[i].
|
479
|
-
float dmin = x[i].
|
511
|
+
float dall = x[i].dm.x;
|
512
|
+
float dmin = x[i].dm.y;
|
480
513
|
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
481
514
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
482
515
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
@@ -486,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|
486
519
|
const int il = tid%16; // 0...15
|
487
520
|
const uint8_t q = x[i].qs[il] >> (2*is);
|
488
521
|
float * y = yy + i*QK_K + 16*is + il;
|
489
|
-
float dall = x[i].
|
490
|
-
float dmin = x[i].
|
522
|
+
float dall = x[i].dm.x;
|
523
|
+
float dmin = x[i].dm.y;
|
491
524
|
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
492
525
|
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
493
526
|
#endif
|
@@ -573,8 +606,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
|
573
606
|
|
574
607
|
float * y = yy + i*QK_K + 64*il + n*ir;
|
575
608
|
|
576
|
-
const float dall = x[i].
|
577
|
-
const float dmin = x[i].
|
609
|
+
const float dall = x[i].dm.x;
|
610
|
+
const float dmin = x[i].dm.y;
|
578
611
|
|
579
612
|
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
580
613
|
|
@@ -612,8 +645,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
|
|
612
645
|
|
613
646
|
float * y = yy + i*QK_K + 64*il + 2*ir;
|
614
647
|
|
615
|
-
const float dall = x[i].
|
616
|
-
const float dmin = x[i].
|
648
|
+
const float dall = x[i].dm.x;
|
649
|
+
const float dmin = x[i].dm.y;
|
617
650
|
|
618
651
|
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
619
652
|
const uint8_t * qh = x[i].qh + 2*ir;
|
@@ -725,8 +758,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|
725
758
|
const float * y = yy + i * QK_K + y_offset;
|
726
759
|
const uint8_t * q = x[i].qs + q_offset;
|
727
760
|
|
728
|
-
const float dall = x[i].
|
729
|
-
const float dmin = x[i].
|
761
|
+
const float dall = x[i].dm.x;
|
762
|
+
const float dmin = x[i].dm.y;
|
730
763
|
|
731
764
|
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
732
765
|
aux[0] = a[0] & 0x0f0f0f0f;
|
@@ -768,9 +801,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|
768
801
|
uaux[0] = s[0] & 0x0f0f0f0f;
|
769
802
|
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
770
803
|
|
771
|
-
const
|
772
|
-
|
773
|
-
const float2 dall = __half22float2(dh[0]);
|
804
|
+
const float2 dall = __half22float2(x[i].dm);
|
774
805
|
|
775
806
|
float sum1 = 0, sum2 = 0;
|
776
807
|
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
@@ -948,8 +979,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
|
948
979
|
const float * y1 = yy + i*QK_K + y_offset;
|
949
980
|
const float * y2 = y1 + 128;
|
950
981
|
|
951
|
-
const float dall = x[i].
|
952
|
-
const float dmin = x[i].
|
982
|
+
const float dall = x[i].dm.x;
|
983
|
+
const float dmin = x[i].dm.y;
|
953
984
|
|
954
985
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
955
986
|
aux[0] = a[im+0] & kmask1;
|
@@ -1081,8 +1112,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
|
1081
1112
|
const float * y1 = yy + i*QK_K + y_offset;
|
1082
1113
|
const float * y2 = y1 + 128;
|
1083
1114
|
|
1084
|
-
const float dall = x[i].
|
1085
|
-
const float dmin = x[i].
|
1115
|
+
const float dall = x[i].dm.x;
|
1116
|
+
const float dmin = x[i].dm.y;
|
1086
1117
|
|
1087
1118
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
1088
1119
|
aux[0] = a[im+0] & kmask1;
|
@@ -1270,19 +1301,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
|
1270
1301
|
v.y = x[ib + iqs + 1];
|
1271
1302
|
}
|
1272
1303
|
|
1273
|
-
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int
|
1274
|
-
const int
|
1304
|
+
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
|
1305
|
+
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
|
1275
1306
|
|
1276
|
-
if (
|
1307
|
+
if (ix >= kx_padded) {
|
1277
1308
|
return;
|
1278
1309
|
}
|
1279
1310
|
|
1311
|
+
const int iy = blockDim.y*blockIdx.y + threadIdx.y;
|
1312
|
+
|
1313
|
+
const int i_padded = iy*kx_padded + ix;
|
1314
|
+
|
1280
1315
|
block_q8_1 * y = (block_q8_1 *) vy;
|
1281
1316
|
|
1282
|
-
const int ib =
|
1283
|
-
const int iqs =
|
1317
|
+
const int ib = i_padded / QK8_1; // block index
|
1318
|
+
const int iqs = i_padded % QK8_1; // quant index
|
1284
1319
|
|
1285
|
-
const float xi =
|
1320
|
+
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
|
1286
1321
|
float amax = fabsf(xi);
|
1287
1322
|
float sum = xi;
|
1288
1323
|
|
@@ -1301,8 +1336,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|
1301
1336
|
return;
|
1302
1337
|
}
|
1303
1338
|
|
1304
|
-
y[ib].
|
1305
|
-
y[ib].
|
1339
|
+
y[ib].ds.x = d;
|
1340
|
+
y[ib].ds.y = sum;
|
1306
1341
|
}
|
1307
1342
|
|
1308
1343
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
@@ -1326,485 +1361,1816 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
|
|
1326
1361
|
y[iybs + iqs + y_offset] = v.y;
|
1327
1362
|
}
|
1328
1363
|
|
1329
|
-
|
1330
|
-
|
1331
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1332
|
-
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1364
|
+
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
1365
|
+
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
1333
1366
|
|
1334
|
-
|
1335
|
-
|
1336
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1337
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
|
1367
|
+
#define VDR_Q4_0_Q8_1_MMVQ 2
|
1368
|
+
#define VDR_Q4_0_Q8_1_MMQ 4
|
1338
1369
|
|
1339
|
-
|
1370
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
1371
|
+
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
1372
|
+
|
1373
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1374
|
+
int sumi = 0;
|
1340
1375
|
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1376
|
+
#pragma unroll
|
1377
|
+
for (int i = 0; i < vdr; ++i) {
|
1378
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1379
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1344
1380
|
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1381
|
+
// SIMD dot product of quantized values
|
1382
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1383
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1384
|
+
}
|
1348
1385
|
|
1349
|
-
|
1386
|
+
// second part effectively subtracts 8 from each quant value
|
1387
|
+
return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
|
1350
1388
|
#else
|
1351
1389
|
return 0.0f; // only to satisfy the compiler
|
1352
1390
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1353
1391
|
}
|
1354
1392
|
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1393
|
+
#define VDR_Q4_1_Q8_1_MMVQ 2
|
1394
|
+
#define VDR_Q4_1_Q8_1_MMQ 4
|
1395
|
+
|
1396
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
1397
|
+
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
1359
1398
|
|
1360
|
-
|
1361
|
-
|
1362
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
|
1399
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1400
|
+
int sumi = 0;
|
1363
1401
|
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1402
|
+
#pragma unroll
|
1403
|
+
for (int i = 0; i < vdr; ++i) {
|
1404
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1405
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1367
1406
|
|
1368
|
-
|
1369
|
-
|
1407
|
+
// SIMD dot product of quantized values
|
1408
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1409
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1410
|
+
}
|
1370
1411
|
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1412
|
+
#ifdef GGML_CUDA_F16
|
1413
|
+
const half2 tmp = __hmul2(dm4, ds8);
|
1414
|
+
const float d4d8 = __half2float(tmp.x);
|
1415
|
+
const float m4s8 = __half2float(tmp.y);
|
1416
|
+
#else
|
1417
|
+
const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
|
1418
|
+
const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
|
1419
|
+
#endif // GGML_CUDA_F16
|
1374
1420
|
|
1375
|
-
|
1421
|
+
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1422
|
+
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1376
1423
|
#else
|
1377
1424
|
return 0.0f; // only to satisfy the compiler
|
1378
1425
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1379
1426
|
}
|
1380
1427
|
|
1381
|
-
|
1382
|
-
|
1428
|
+
#define VDR_Q5_0_Q8_1_MMVQ 2
|
1429
|
+
#define VDR_Q5_0_Q8_1_MMQ 4
|
1430
|
+
|
1431
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
1432
|
+
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
1433
|
+
|
1383
1434
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1384
|
-
|
1435
|
+
int sumi = 0;
|
1436
|
+
|
1437
|
+
for (int i = 0; i < vdr; ++i) {
|
1438
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1439
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1440
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1441
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1442
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1443
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1444
|
+
|
1445
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1446
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1447
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1448
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1449
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1450
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1451
|
+
}
|
1385
1452
|
|
1386
|
-
|
1387
|
-
|
1388
|
-
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1389
|
-
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1390
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1391
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
|
1392
|
-
|
1393
|
-
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
|
1394
|
-
|
1395
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1396
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1397
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1398
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1399
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1400
|
-
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
|
1401
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1402
|
-
|
1403
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1404
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1405
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1406
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1407
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1408
|
-
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
|
1409
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1410
|
-
|
1411
|
-
return sumi*d;
|
1453
|
+
// second part effectively subtracts 16 from each quant value
|
1454
|
+
return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
|
1412
1455
|
#else
|
1413
1456
|
return 0.0f; // only to satisfy the compiler
|
1414
1457
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1415
1458
|
}
|
1416
1459
|
|
1417
|
-
|
1418
|
-
|
1460
|
+
#define VDR_Q5_1_Q8_1_MMVQ 2
|
1461
|
+
#define VDR_Q5_1_Q8_1_MMQ 4
|
1462
|
+
|
1463
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
|
1464
|
+
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
|
1465
|
+
|
1419
1466
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1420
|
-
|
1467
|
+
int sumi = 0;
|
1468
|
+
|
1469
|
+
for (int i = 0; i < vdr; ++i) {
|
1470
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1471
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1472
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1473
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1474
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1475
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1476
|
+
|
1477
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1478
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1479
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1480
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1481
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1482
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1483
|
+
}
|
1484
|
+
|
1485
|
+
#ifdef GGML_CUDA_F16
|
1486
|
+
const half2 tmp = __hmul2(dm5, ds8);
|
1487
|
+
const float d5d8 = __half2float(tmp.x);
|
1488
|
+
const float m5s8 = __half2float(tmp.y);
|
1489
|
+
#else
|
1490
|
+
const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
|
1491
|
+
const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
|
1492
|
+
#endif // GGML_CUDA_F16
|
1493
|
+
|
1494
|
+
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
1495
|
+
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1421
1496
|
|
1422
|
-
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
|
1423
|
-
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1424
|
-
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1425
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1426
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
|
1427
|
-
|
1428
|
-
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
|
1429
|
-
const float m = bq5_1->m;
|
1430
|
-
const float s = bq8_1->s;
|
1431
|
-
|
1432
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1433
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1434
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1435
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1436
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1437
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1438
|
-
|
1439
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1440
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1441
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1442
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1443
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1444
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1445
|
-
|
1446
|
-
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
|
1447
1497
|
#else
|
1448
1498
|
return 0.0f; // only to satisfy the compiler
|
1449
1499
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1450
1500
|
}
|
1451
1501
|
|
1452
|
-
|
1453
|
-
|
1454
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1455
|
-
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1502
|
+
#define VDR_Q8_0_Q8_1_MMVQ 2
|
1503
|
+
#define VDR_Q8_0_Q8_1_MMQ 8
|
1456
1504
|
|
1457
|
-
|
1458
|
-
|
1459
|
-
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1505
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
|
1506
|
+
const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
|
1460
1507
|
|
1461
|
-
|
1508
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1509
|
+
int sumi = 0;
|
1462
1510
|
|
1463
|
-
|
1464
|
-
|
1511
|
+
for (int i = 0; i < vdr; ++i) {
|
1512
|
+
// SIMD dot product of quantized values
|
1513
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1514
|
+
}
|
1465
1515
|
|
1466
|
-
return sumi*
|
1516
|
+
return sumi * d8_0 * __half2float(ds8_1.x);
|
1467
1517
|
#else
|
1468
1518
|
return 0.0f; // only to satisfy the compiler
|
1469
1519
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1470
1520
|
}
|
1471
1521
|
|
1472
|
-
static __device__ __forceinline__ float
|
1473
|
-
const
|
1522
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
|
1523
|
+
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
|
1474
1524
|
|
1475
1525
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1476
|
-
|
1526
|
+
int sumi = 0;
|
1477
1527
|
|
1478
|
-
|
1479
|
-
|
1528
|
+
for (int i = 0; i < vdr; ++i) {
|
1529
|
+
// SIMD dot product of quantized values
|
1530
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1531
|
+
}
|
1480
1532
|
|
1481
|
-
|
1482
|
-
|
1533
|
+
#ifdef GGML_CUDA_F16
|
1534
|
+
const half2 tmp = __hmul2(dm8, ds8);
|
1535
|
+
const float d8d8 = __half2float(tmp.x);
|
1536
|
+
const float m8s8 = __half2float(tmp.y);
|
1537
|
+
#else
|
1538
|
+
const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
|
1539
|
+
const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
|
1540
|
+
#endif // GGML_CUDA_F16
|
1483
1541
|
|
1484
|
-
|
1485
|
-
|
1542
|
+
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1543
|
+
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1544
|
+
#else
|
1545
|
+
return 0.0f; // only to satisfy the compiler
|
1546
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1547
|
+
}
|
1486
1548
|
|
1487
|
-
|
1549
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
1550
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1488
1551
|
|
1489
|
-
|
1490
|
-
const int sc = bq2_K->scales[scale_offset + 2*i];
|
1552
|
+
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1491
1553
|
|
1492
|
-
|
1493
|
-
|
1554
|
+
int v[VDR_Q4_0_Q8_1_MMVQ];
|
1555
|
+
int u[2*VDR_Q4_0_Q8_1_MMVQ];
|
1494
1556
|
|
1495
|
-
|
1496
|
-
|
1557
|
+
#pragma unroll
|
1558
|
+
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
|
1559
|
+
v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
|
1560
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1561
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
|
1562
|
+
}
|
1563
|
+
|
1564
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1568
|
+
|
1569
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1570
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
|
1571
|
+
|
1572
|
+
*x_ql = tile_x_qs;
|
1573
|
+
*x_dm = (half2 *) tile_x_d;
|
1574
|
+
}
|
1575
|
+
|
1576
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1577
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1578
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1579
|
+
|
1580
|
+
__builtin_assume(i_offset >= 0);
|
1581
|
+
__builtin_assume(i_offset < 8);
|
1582
|
+
__builtin_assume(k >= 0);
|
1583
|
+
__builtin_assume(k < WARP_SIZE);
|
1584
|
+
|
1585
|
+
const int kbx = k / QI4_0;
|
1586
|
+
const int kqsx = k % QI4_0;
|
1587
|
+
|
1588
|
+
const block_q4_0 * bx0 = (block_q4_0 *) vx;
|
1589
|
+
|
1590
|
+
float * x_dmf = (float *) x_dm;
|
1591
|
+
|
1592
|
+
#pragma unroll
|
1593
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1594
|
+
int i = i0 + i_offset;
|
1497
1595
|
|
1498
|
-
|
1499
|
-
|
1596
|
+
if (need_check) {
|
1597
|
+
i = min(i, i_max);
|
1598
|
+
}
|
1599
|
+
|
1600
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1601
|
+
|
1602
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1603
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1500
1604
|
}
|
1501
1605
|
|
1502
|
-
|
1503
|
-
|
1504
|
-
|
1505
|
-
|
1606
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1607
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
1608
|
+
|
1609
|
+
// #pragma unroll
|
1610
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
|
1611
|
+
// FIXME out-of-bounds
|
1612
|
+
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1613
|
+
|
1614
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
1615
|
+
// return;
|
1616
|
+
// }
|
1617
|
+
|
1618
|
+
// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1619
|
+
|
1620
|
+
// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
|
1621
|
+
// }
|
1506
1622
|
}
|
1507
1623
|
|
1508
|
-
static __device__ __forceinline__ float
|
1509
|
-
const
|
1624
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1625
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1626
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1510
1627
|
|
1511
|
-
|
1512
|
-
|
1628
|
+
__builtin_assume(i >= 0);
|
1629
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1630
|
+
__builtin_assume(j >= 0);
|
1631
|
+
__builtin_assume(j < WARP_SIZE);
|
1632
|
+
__builtin_assume(k >= 0);
|
1633
|
+
__builtin_assume(k < WARP_SIZE);
|
1513
1634
|
|
1514
|
-
const int
|
1515
|
-
const
|
1635
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1636
|
+
const float * x_dmf = (float *) x_dm;
|
1516
1637
|
|
1517
|
-
|
1638
|
+
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
1518
1639
|
|
1519
|
-
|
1640
|
+
#pragma unroll
|
1641
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1642
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1643
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
|
1644
|
+
}
|
1520
1645
|
|
1521
|
-
|
1522
|
-
|
1646
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1647
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1648
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1649
|
+
}
|
1523
1650
|
|
1524
|
-
|
1525
|
-
|
1526
|
-
vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
|
1527
|
-
vh >>= bq8_offset;
|
1651
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
1652
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1528
1653
|
|
1529
|
-
|
1530
|
-
const int isc = scale_offset + 2*i;
|
1654
|
+
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1531
1655
|
|
1532
|
-
|
1533
|
-
|
1534
|
-
const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
|
1656
|
+
int v[VDR_Q4_1_Q8_1_MMVQ];
|
1657
|
+
int u[2*VDR_Q4_1_Q8_1_MMVQ];
|
1535
1658
|
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1659
|
+
#pragma unroll
|
1660
|
+
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
|
1661
|
+
v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
|
1662
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1663
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
|
1664
|
+
}
|
1539
1665
|
|
1540
|
-
|
1666
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1667
|
+
}
|
1541
1668
|
|
1542
|
-
|
1543
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1544
|
-
const float d8i = bq8i->d;
|
1669
|
+
static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1545
1670
|
|
1546
|
-
|
1671
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
|
1672
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
|
1547
1673
|
|
1548
|
-
|
1674
|
+
*x_ql = tile_x_qs;
|
1675
|
+
*x_dm = tile_x_dm;
|
1676
|
+
}
|
1549
1677
|
|
1550
|
-
|
1678
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
1679
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1680
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1681
|
+
|
1682
|
+
__builtin_assume(i_offset >= 0);
|
1683
|
+
__builtin_assume(i_offset < 8);
|
1684
|
+
__builtin_assume(k >= 0);
|
1685
|
+
__builtin_assume(k < WARP_SIZE);
|
1686
|
+
|
1687
|
+
const int kbx = k / QI4_1;
|
1688
|
+
const int kqsx = k % QI4_1;
|
1689
|
+
|
1690
|
+
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
1551
1691
|
|
1552
|
-
|
1692
|
+
#pragma unroll
|
1693
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1694
|
+
int i = i0 + i_offset;
|
1695
|
+
|
1696
|
+
if (need_check) {
|
1697
|
+
i = min(i, i_max);
|
1698
|
+
}
|
1699
|
+
|
1700
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1701
|
+
|
1702
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1553
1703
|
}
|
1554
1704
|
|
1555
|
-
|
1556
|
-
|
1557
|
-
|
1558
|
-
#
|
1705
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
|
1706
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1707
|
+
|
1708
|
+
#pragma unroll
|
1709
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
|
1710
|
+
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
1711
|
+
|
1712
|
+
if (need_check) {
|
1713
|
+
i = min(i, i_max);
|
1714
|
+
}
|
1715
|
+
|
1716
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1717
|
+
|
1718
|
+
x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
|
1719
|
+
}
|
1559
1720
|
}
|
1560
1721
|
|
1561
|
-
static __device__ __forceinline__ float
|
1562
|
-
const
|
1722
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
1723
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1724
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1563
1725
|
|
1564
|
-
|
1565
|
-
|
1726
|
+
__builtin_assume(i >= 0);
|
1727
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1728
|
+
__builtin_assume(j >= 0);
|
1729
|
+
__builtin_assume(j < WARP_SIZE);
|
1730
|
+
__builtin_assume(k >= 0);
|
1731
|
+
__builtin_assume(k < WARP_SIZE);
|
1566
1732
|
|
1567
|
-
|
1568
|
-
float sumf_m = 0.0f;
|
1733
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1569
1734
|
|
1570
|
-
|
1735
|
+
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
1571
1736
|
|
1572
|
-
|
1573
|
-
|
1737
|
+
#pragma unroll
|
1738
|
+
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
1739
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1740
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
|
1741
|
+
}
|
1574
1742
|
|
1575
|
-
|
1576
|
-
|
1743
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
1744
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
1745
|
+
y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1746
|
+
}
|
1577
1747
|
|
1578
|
-
|
1579
|
-
|
1580
|
-
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
1581
|
-
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
1748
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
1749
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1582
1750
|
|
1583
|
-
const
|
1584
|
-
const int v1 = q4[0];
|
1585
|
-
const int v2 = q4[4];
|
1751
|
+
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
|
1586
1752
|
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
1594
|
-
|
1595
|
-
|
1753
|
+
int vl[VDR_Q5_0_Q8_1_MMVQ];
|
1754
|
+
int vh[VDR_Q5_0_Q8_1_MMVQ];
|
1755
|
+
int u[2*VDR_Q5_0_Q8_1_MMVQ];
|
1756
|
+
|
1757
|
+
#pragma unroll
|
1758
|
+
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
|
1759
|
+
vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
|
1760
|
+
vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
|
1761
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1762
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
|
1596
1763
|
}
|
1597
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
1598
|
-
const uint8_t * m = sc + 2;
|
1599
1764
|
|
1600
|
-
|
1765
|
+
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
1766
|
+
}
|
1767
|
+
|
1768
|
+
static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1769
|
+
|
1770
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1771
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
|
1772
|
+
|
1773
|
+
*x_ql = tile_x_ql;
|
1774
|
+
*x_dm = (half2 *) tile_x_d;
|
1775
|
+
}
|
1776
|
+
|
1777
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
1778
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1779
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1780
|
+
|
1781
|
+
__builtin_assume(i_offset >= 0);
|
1782
|
+
__builtin_assume(i_offset < 8);
|
1783
|
+
__builtin_assume(k >= 0);
|
1784
|
+
__builtin_assume(k < WARP_SIZE);
|
1785
|
+
|
1786
|
+
const int kbx = k / QI5_0;
|
1787
|
+
const int kqsx = k % QI5_0;
|
1788
|
+
|
1789
|
+
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
1790
|
+
|
1791
|
+
#pragma unroll
|
1792
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1793
|
+
int i = i0 + i_offset;
|
1794
|
+
|
1795
|
+
if (need_check) {
|
1796
|
+
i = min(i, i_max);
|
1797
|
+
}
|
1798
|
+
|
1799
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1800
|
+
|
1801
|
+
const int ql = get_int_from_uint8(bxi->qs, kqsx);
|
1802
|
+
const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
|
1803
|
+
|
1804
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1805
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1806
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1807
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1808
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1809
|
+
qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
|
1810
|
+
|
1811
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1812
|
+
|
1813
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1814
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1815
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1816
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1817
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1818
|
+
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
1819
|
+
|
1820
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1821
|
+
}
|
1822
|
+
|
1823
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
|
1824
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1825
|
+
float * x_dmf = (float *) x_dm;
|
1826
|
+
|
1827
|
+
#pragma unroll
|
1828
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
|
1829
|
+
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
1830
|
+
|
1831
|
+
if (need_check) {
|
1832
|
+
i = min(i, i_max);
|
1833
|
+
}
|
1834
|
+
|
1835
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1836
|
+
|
1837
|
+
x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
|
1838
|
+
}
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
1842
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1843
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1844
|
+
|
1845
|
+
__builtin_assume(i >= 0);
|
1846
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1847
|
+
__builtin_assume(j >= 0);
|
1848
|
+
__builtin_assume(j < WARP_SIZE);
|
1849
|
+
__builtin_assume(k >= 0);
|
1850
|
+
__builtin_assume(k < WARP_SIZE);
|
1851
|
+
|
1852
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1853
|
+
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
1854
|
+
const float * x_dmf = (float *) x_dm;
|
1855
|
+
|
1856
|
+
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
1857
|
+
|
1858
|
+
#pragma unroll
|
1859
|
+
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
1860
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1861
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
1865
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1866
|
+
}
|
1867
|
+
|
1868
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
1869
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1870
|
+
|
1871
|
+
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
|
1872
|
+
|
1873
|
+
int vl[VDR_Q5_1_Q8_1_MMVQ];
|
1874
|
+
int vh[VDR_Q5_1_Q8_1_MMVQ];
|
1875
|
+
int u[2*VDR_Q5_1_Q8_1_MMVQ];
|
1876
|
+
|
1877
|
+
#pragma unroll
|
1878
|
+
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
|
1879
|
+
vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
|
1880
|
+
vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
|
1881
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1882
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
|
1883
|
+
}
|
1884
|
+
|
1885
|
+
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
1886
|
+
}
|
1887
|
+
|
1888
|
+
static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1889
|
+
|
1890
|
+
__shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
1891
|
+
__shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
|
1892
|
+
|
1893
|
+
*x_ql = tile_x_ql;
|
1894
|
+
*x_dm = tile_x_dm;
|
1895
|
+
}
|
1896
|
+
|
1897
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
1898
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1899
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1900
|
+
|
1901
|
+
__builtin_assume(i_offset >= 0);
|
1902
|
+
__builtin_assume(i_offset < 8);
|
1903
|
+
__builtin_assume(k >= 0);
|
1904
|
+
__builtin_assume(k < WARP_SIZE);
|
1905
|
+
|
1906
|
+
const int kbx = k / QI5_1;
|
1907
|
+
const int kqsx = k % QI5_1;
|
1908
|
+
|
1909
|
+
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
1910
|
+
|
1911
|
+
#pragma unroll
|
1912
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
1913
|
+
int i = i0 + i_offset;
|
1914
|
+
|
1915
|
+
if (need_check) {
|
1916
|
+
i = min(i, i_max);
|
1917
|
+
}
|
1918
|
+
|
1919
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
1920
|
+
|
1921
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
1922
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
|
1923
|
+
|
1924
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
1925
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
1926
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
1927
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
1928
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
1929
|
+
|
1930
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
1931
|
+
|
1932
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
1933
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
1934
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
1935
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
1936
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
1937
|
+
|
1938
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
1939
|
+
}
|
1940
|
+
|
1941
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
|
1942
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1943
|
+
|
1944
|
+
#pragma unroll
|
1945
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
|
1946
|
+
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
1947
|
+
|
1948
|
+
if (need_check) {
|
1949
|
+
i = min(i, i_max);
|
1950
|
+
}
|
1951
|
+
|
1952
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1953
|
+
|
1954
|
+
x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
|
1955
|
+
}
|
1956
|
+
}
|
1957
|
+
|
1958
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
1959
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1960
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1961
|
+
|
1962
|
+
__builtin_assume(i >= 0);
|
1963
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1964
|
+
__builtin_assume(j >= 0);
|
1965
|
+
__builtin_assume(j < WARP_SIZE);
|
1966
|
+
__builtin_assume(k >= 0);
|
1967
|
+
__builtin_assume(k < WARP_SIZE);
|
1968
|
+
|
1969
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1970
|
+
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
1971
|
+
|
1972
|
+
int u[2*VDR_Q5_1_Q8_1_MMQ];
|
1973
|
+
|
1974
|
+
#pragma unroll
|
1975
|
+
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
1976
|
+
u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
|
1977
|
+
u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
|
1978
|
+
}
|
1979
|
+
|
1980
|
+
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
1981
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
|
1982
|
+
}
|
1983
|
+
|
1984
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
1985
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1986
|
+
|
1987
|
+
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1988
|
+
|
1989
|
+
int v[VDR_Q8_0_Q8_1_MMVQ];
|
1990
|
+
int u[VDR_Q8_0_Q8_1_MMVQ];
|
1991
|
+
|
1992
|
+
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
1993
|
+
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
1994
|
+
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1995
|
+
}
|
1996
|
+
|
1997
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
|
1998
|
+
}
|
1999
|
+
|
2000
|
+
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2001
|
+
|
2002
|
+
__shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
|
2003
|
+
__shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
|
2004
|
+
|
2005
|
+
*x_ql = tile_x_qs;
|
2006
|
+
*x_dm = (half2 *) tile_x_d;
|
2007
|
+
}
|
2008
|
+
|
2009
|
+
template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2010
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2011
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2012
|
+
|
2013
|
+
__builtin_assume(i_offset >= 0);
|
2014
|
+
__builtin_assume(i_offset < 8);
|
2015
|
+
__builtin_assume(k >= 0);
|
2016
|
+
__builtin_assume(k < WARP_SIZE);
|
2017
|
+
|
2018
|
+
const int kbx = k / QI8_0;
|
2019
|
+
const int kqsx = k % QI8_0;
|
2020
|
+
float * x_dmf = (float *) x_dm;
|
2021
|
+
|
2022
|
+
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2023
|
+
|
2024
|
+
#pragma unroll
|
2025
|
+
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
|
2026
|
+
int i = i0 + i_offset;
|
2027
|
+
|
2028
|
+
if (need_check) {
|
2029
|
+
i = min(i, i_max);
|
2030
|
+
}
|
2031
|
+
|
2032
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2033
|
+
|
2034
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2035
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
|
2036
|
+
}
|
2037
|
+
|
2038
|
+
// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2039
|
+
// const int kbxd = k % blocks_per_tile_x_row;
|
2040
|
+
|
2041
|
+
// #pragma unroll
|
2042
|
+
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
|
2043
|
+
// FIXME out-of-bounds
|
2044
|
+
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2045
|
+
|
2046
|
+
// #if GGML_CUDA_MMQ_Y < 64
|
2047
|
+
// if (i >= GGML_CUDA_MMQ_Y) {
|
2048
|
+
// return;
|
2049
|
+
// }
|
2050
|
+
// #endif // GGML_CUDA_MMQ_Y < 64
|
2051
|
+
|
2052
|
+
// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2053
|
+
|
2054
|
+
// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
|
2055
|
+
// }
|
2056
|
+
}
|
2057
|
+
|
2058
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2059
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2060
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2061
|
+
|
2062
|
+
__builtin_assume(i >= 0);
|
2063
|
+
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2064
|
+
__builtin_assume(j >= 0);
|
2065
|
+
__builtin_assume(j < WARP_SIZE);
|
2066
|
+
__builtin_assume(k >= 0);
|
2067
|
+
__builtin_assume(k < WARP_SIZE);
|
2068
|
+
|
2069
|
+
const float * x_dmf = (float *) x_dm;
|
2070
|
+
|
2071
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2072
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2073
|
+
y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2074
|
+
}
|
2075
|
+
|
2076
|
+
#define VDR_q2_K_q8_1 1
|
2077
|
+
|
2078
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
|
2079
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2080
|
+
const half2 & dm, const float * __restrict__ d8) {
|
2081
|
+
|
2082
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2083
|
+
float sumf_d = 0.0f;
|
2084
|
+
float sumf_m = 0.0f;
|
2085
|
+
|
2086
|
+
for (int i = 0; i < QR2_K; ++i) {
|
2087
|
+
const int sc = scales[2*i];
|
2088
|
+
|
2089
|
+
const int vi = (v >> (2*i)) & 0x03030303;
|
2090
|
+
|
2091
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
2092
|
+
|
2093
|
+
int sc_high = sc >> 4;
|
2094
|
+
sc_high |= sc_high << 8;
|
2095
|
+
sc_high |= sc_high << 16;
|
2096
|
+
sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
const float2 dmf = __half22float2(dm);
|
2100
|
+
|
2101
|
+
return dmf.x*sumf_d - dmf.y*sumf_m;
|
2102
|
+
#else
|
2103
|
+
return 0.0f; // only to satisfy the compiler
|
2104
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2105
|
+
}
|
2106
|
+
|
2107
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
2108
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2109
|
+
|
2110
|
+
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
|
2111
|
+
|
2112
|
+
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
2113
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2114
|
+
|
2115
|
+
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2116
|
+
|
2117
|
+
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2118
|
+
int u[QR2_K];
|
2119
|
+
float d8[QR2_K];
|
2120
|
+
|
2121
|
+
for (int i = 0; i < QR2_K; ++ i) {
|
2122
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2123
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2124
|
+
}
|
2125
|
+
|
2126
|
+
return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
|
2127
|
+
}

static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset < 8);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI2_K;
    const int kqsx = k % QI2_K;

    const block_q2_K * bx0 = (block_q2_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
    }
}
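
// Note on the shared-memory tiles loaded above: x_ql rows are strided by WARP_SIZE + 1 rather
// than WARP_SIZE, and the buffers are over-allocated by GGML_CUDA_MMQ_Y (or a fraction of it).
// The extra column appears to be the usual padding trick so that threads of a warp hit distinct
// shared-memory banks; a small illustration of the resulting offsets (assuming WARP_SIZE == 32):
//
//   // int idx_padded   = i * (WARP_SIZE + 1) + k; // consecutive rows start 33 ints apart
//   // int idx_unpadded = i * WARP_SIZE + k;       // rows 32 apart would alias banks column-wise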

static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i < GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j < WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI2_K;
    const int kqsx = k % QI2_K;

    const int bq8_offset = QR2_K * (kqsx / QI8_1);
    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);

    const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;

    int u[QR2_K];
    float d8[QR2_K];

    for (int l = 0; l < QR2_K; ++ l) {
        const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
        u[l] = y_qs[y_qs_index];
        d8[l] = y_ds[y_qs_index / QI8_1].x;
    }

    return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
}

#define VDR_q3_K_q8_1 1

static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih);

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_from_uint8(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int u[QR3_K];
    float d8[QR3_K];

    for (int i = 0; i < QR3_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + i].ds.x;
    }

    return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset < 8);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI3_K;
    const int kqsx = k % QI3_K;

    const block_q3_K * bx0 = (block_q3_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
    const int kbxd = k % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);

        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);

        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
    }
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i < GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j < WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI3_K;
    const int kqsx = k % QI3_K;

    const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);

    const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;

    int u[QR3_K];
    float d8[QR3_K];

    for (int l = 0; l < QR3_K; ++ l) {
        const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
        u[l] = y_qs[y_qs_index];
        d8[l] = y_ds[y_qs_index / QI8_1].x;
    }

    return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
                                  x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
}

#define VDR_q4_K_q8_1 2

static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }

    return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;

#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int v[2];
    int u[2*QR4_K];
    float d8[QR4_K];

    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];

    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds.x;

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    const uint16_t * a = (const uint16_t *)bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;

    const float dall = bq4_K->d[0];
    const float dmin = bq4_K->d[1];

    const float d8_1 = bq8_1[0].ds.x;
    const float d8_2 = bq8_1[1].ds.x;

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
    const int v1 = q4[0];
    const int v2 = q4[4];

    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));

    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

    return dall * sumf_d - dmin * sumf_m;

#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}
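
// The 12 scale bytes of a q4_K block pack eight 6-bit scales and eight 6-bit mins; the aux[]
// manipulation above extracts two scales (sc[0..1]) and two mins (m[0..1]) per call. A scalar
// sketch of the same unpacking for the first four sub-blocks (illustrative only; the CPU side
// has an equivalent helper in k_quants.c):
//
//   static void unpack_q4_K_scale_min_lo(const uint8_t * scales, int j, uint8_t * sc, uint8_t * m) {
//       // j in 0..3: low 6 bits of bytes 0..3 hold the scales, of bytes 4..7 the mins
//       *sc = scales[j + 0] & 63;
//       *m  = scales[j + 4] & 63;
//   }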

static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_sc = tile_x_sc;
}

template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset < 8);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI4_K; // == 0 if QK_K == 256
    const int kqsx = k % QI4_K; // == k if QK_K == 256

    const block_q4_K * bx0 = (block_q4_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i < GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j < WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    int v[2];
    int u[2*QR4_K];
    float d8[QR4_K];

    // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));

    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];

    const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
    uint16_t aux[2];
    const int l = bq8_offset/2;
    if (l < 2) {
        aux[0] = scales[l+0] & 0x3f3f;
        aux[1] = scales[l+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

    for (int l = 0; l < QR4_K; ++l) {
        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
        d8[l] = y_ds[kqsy / QI8_1].x;
    }

    return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
}

#define VDR_q5_K_q8_1 2

static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }

    return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;

#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int vl[2];
    int vh[2];
    int u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds.x;

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);

#else

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    const int8_t * s = bq5_K->scales;

    const float d = bq5_K->d;

    const float d8_1 = bq8_1[0].ds.x;
    const float d8_2 = bq8_1[1].ds.x;

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * ql = (const int *)bq5_K->qs + (iqs/2);
    const int vl1 = ql[0];
    const int vl2 = ql[4];

    const int step = 4 * (iqs/2); // 0, 4, 8, 12
    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
    const int in = step%8; // 0, 4, 0, 4
    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

    return d * sumf_d;

#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset < 8);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI5_K; // == 0 if QK_K == 256
    const int kqsx = k % QI5_K; // == k if QK_K == 256

    const block_q5_K * bx0 = (block_q5_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);

        x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i < GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j < WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    int vl[2];
    int vh[2];
    int u[2*QR4_K];
    float d8[QR4_K];

    const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));

    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];

    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
    uint16_t aux[2];
    const int l = bq8_offset/2;
    if (l < 2) {
        aux[0] = scales[l+0] & 0x3f3f;
        aux[1] = scales[l+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

    for (int l = 0; l < QR5_K; ++l) {
        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
        d8[l] = y_ds[kqsy / QI8_1].x;
    }

    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
}

#define VDR_q6_K_q8_1 1

static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int u[QR6_K];
    float d8[QR6_K];

    for (int i = 0; i < QR6_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
    }

    return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
}

static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];

    *x_ql = tile_x_ql;
    *x_dm = tile_x_dm;
    *x_qh = tile_x_qh;
    *x_sc = tile_x_sc;
}

template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

    __builtin_assume(i_offset >= 0);
    __builtin_assume(i_offset < 8);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const block_q6_K * bx0 = (block_q6_K *) vx;

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
        int i = i0 + i_offset;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;

        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
    }

    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
    const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;

        x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);

        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
    }

#pragma unroll
    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;

        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
    }
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

    __builtin_assume(i >= 0);
    __builtin_assume(i < GGML_CUDA_MMQ_Y);
    __builtin_assume(j >= 0);
    __builtin_assume(j < WARP_SIZE);
    __builtin_assume(k >= 0);
    __builtin_assume(k < WARP_SIZE);

    const int kbx = k / QI6_K; // == 0 if QK_K == 256
    const int kqsx = k % QI6_K; // == k if QK_K == 256

    const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));

    const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;

    const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
    const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;

    int u[QR6_K];
    float d8[QR6_K];

    for (int l = 0; l < QR6_K; ++l) {
        const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
        u[l] = y_qs[kqsy];
        d8[l] = y_ds[kqsy / QI8_1].x;
    }

    return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
                                  x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
}

template <int qk, int qr, int qi, typename block_q_t,
          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
static __global__ void mul_mat_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    const int blocks_per_row_x = ncols_x / qk;
    const int blocks_per_col_y = nrows_y / QK8_1;
    const int blocks_per_warp = WARP_SIZE / qi;

    const int & ncols_dst = ncols_y;

    const int tid_x = threadIdx.x;
    const int tid_y = threadIdx.y;

    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
    const int & row_x_0 = row_dst_0;
    const int row_dst = row_dst_0 + tid_x;

    const int col_dst_0 = blockIdx.y*WARP_SIZE;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
    half2 * tile_x_dm = nullptr;
    int   * tile_x_qh = nullptr;
    int   * tile_x_sc = nullptr;

    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

    const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;

    __shared__ int   tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
    __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];

    float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};

    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);

        for (int ir = 0; ir < qr; ++ir) {
            const int kqs = ir*WARP_SIZE + tid_x;
            const int kbxd = kqs / QI8_1;

            for (int i = 0; i < WARP_SIZE; i += 8) {
                const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

                tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
            }
        }

        for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
            const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
            const int kby = tid_x % blocks_per_tile_y_col;
            const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
            tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
        }

        __syncthreads();

#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
#pragma unroll
#endif // __CUDA_ARCH__ >= 700
        for (int k = 0; k < WARP_SIZE; k += vdr) {
#pragma unroll
            for (int j = 0; j < WARP_SIZE; j += 8) {
#pragma unroll
                for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
                    sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
                                                     tid_x + i, tid_y + j, k);
                }
            }
        }

        __syncthreads();
    }

    if (row_dst >= nrows_dst) {
        return;
    }

    for (int j = 0; j < WARP_SIZE; j += 8) {
        const int col_dst = col_dst_0 + j + tid_y;

        if (col_dst >= ncols_dst) {
            return;
        }

        for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
        }
    }
}
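
// Shape of the mul_mat_q kernel above: each thread block computes a GGML_CUDA_MMQ_Y x WARP_SIZE
// tile of dst, staging the quantized x tile and the q8_1 y tile in shared memory and accumulating
// through the per-type vec_dot callback. A sketch of the launch geometry it expects (illustrative
// only; it mirrors the ggml_mul_mat_q*_q8_1_cuda launchers further below):
//
//   // const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
//   // const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
//   // const dim3 block_nums(block_num_x, block_num_y, 1);
//   // const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); // 32 x 8 threads per block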

-template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

@@ -1813,7 +3179,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    }

    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;

    // partial sum for each thread
    float tmp = 0.0f;
@@ -1822,11 +3188,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+        const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index

-        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
+        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx

-        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+        const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int

        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
    }
}
|
@@ -1859,11 +3225,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1859
3225
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
1860
3226
|
|
1861
3227
|
// partial sum for each thread
|
1862
|
-
#ifdef
|
3228
|
+
#ifdef GGML_CUDA_F16
|
1863
3229
|
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
1864
3230
|
#else
|
1865
3231
|
float tmp = 0.0f;
|
1866
|
-
#endif //
|
3232
|
+
#endif // GGML_CUDA_F16
|
1867
3233
|
|
1868
3234
|
for (int i = 0; i < ncols; i += iter_stride) {
|
1869
3235
|
const int col = i + vals_per_iter*tid;
|
@@ -1883,7 +3249,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1883
3249
|
|
1884
3250
|
// matrix multiplication
|
1885
3251
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
1886
|
-
#ifdef
|
3252
|
+
#ifdef GGML_CUDA_F16
|
1887
3253
|
tmp += __hmul2(v, {
|
1888
3254
|
y[iybs + iqs + j/qr + 0],
|
1889
3255
|
y[iybs + iqs + j/qr + y_offset]
|
@@ -1891,7 +3257,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1891
3257
|
#else
|
1892
3258
|
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
1893
3259
|
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
1894
|
-
#endif //
|
3260
|
+
#endif // GGML_CUDA_F16
|
1895
3261
|
}
|
1896
3262
|
}
|
1897
3263
|
|
@@ -1902,11 +3268,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1902
3268
|
}
|
1903
3269
|
|
1904
3270
|
if (tid == 0) {
|
1905
|
-
#ifdef
|
3271
|
+
#ifdef GGML_CUDA_F16
|
1906
3272
|
dst[row] = tmp.x + tmp.y;
|
1907
3273
|
#else
|
1908
3274
|
dst[row] = tmp;
|
1909
|
-
#endif //
|
3275
|
+
#endif // GGML_CUDA_F16
|
1910
3276
|
}
|
1911
3277
|
}
|
1912
3278
|
|
@@ -2046,7 +3412,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
2046
3412
|
}
|
2047
3413
|
|
2048
3414
|
// rope == RoPE == rotary positional embedding
|
2049
|
-
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float
|
3415
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3416
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
2050
3417
|
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
2051
3418
|
|
2052
3419
|
if (col >= ncols) {
|
@@ -2056,7 +3423,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
2056
3423
|
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
2057
3424
|
const int i = row*ncols + col;
|
2058
3425
|
|
2059
|
-
const float theta =
|
3426
|
+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
2060
3427
|
const float sin_theta = sinf(theta);
|
2061
3428
|
const float cos_theta = cosf(theta);
|
2062
3429
|
|
@@ -2203,9 +3570,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
|
|
2203
3570
|
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
2204
3571
|
}
|
2205
3572
|
|
2206
|
-
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int
|
2207
|
-
const int
|
2208
|
-
|
3573
|
+
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
|
3574
|
+
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
3575
|
+
const dim3 num_blocks(block_num_x, ky, 1);
|
3576
|
+
const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
|
3577
|
+
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
2209
3578
|
}
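
// quantize_row_q8_1_cuda now quantizes a whole kx x ky matrix in a single launch, with one grid
// row per y row and kx padded up to kx_padded. A quick check of the grid math (illustrative;
// assumes CUDA_QUANTIZE_BLOCK_SIZE == 256):
//
//   // kx = 4096, ky = 32, kx_padded = 4096
//   // block_num_x = (4096 + 256 - 1) / 256 = 16  -> the grid is 16 x 32 x 1 blocks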

static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2366,7 +3735,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2375,7 +3744,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2384,7 +3753,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2393,7 +3762,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2402,7 +3771,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2411,7 +3780,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2420,7 +3789,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2429,10 +3798,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
-    // is better amortized.
-    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2441,10 +3807,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
-    // is better amortized.
-    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2453,7 +3816,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

@@ -2500,6 +3863,186 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    }
}

+static void ggml_mul_mat_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
+
+    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
+    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3905
|
+
|
3906
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3907
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3908
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3909
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
3910
|
+
|
3911
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
3912
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3913
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3914
|
+
} else {
|
3915
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3916
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3917
|
+
}
|
3918
|
+
}
|
3919
|
+
|
3920
|
+
static void ggml_mul_mat_q5_1_q8_1_cuda(
|
3921
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3922
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3923
|
+
|
3924
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3925
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3926
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3927
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
3928
|
+
|
3929
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
3930
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3931
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3932
|
+
} else {
|
3933
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3934
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3935
|
+
}
|
3936
|
+
}
|
3937
|
+
|
3938
|
+
static void ggml_mul_mat_q8_0_q8_1_cuda(
|
3939
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3940
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3941
|
+
|
3942
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3943
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3944
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3945
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
3946
|
+
|
3947
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
3948
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3949
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3950
|
+
} else {
|
3951
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3952
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3953
|
+
}
|
3954
|
+
}
|
3955
|
+
|
3956
|
+
static void ggml_mul_mat_q2_K_q8_1_cuda(
|
3957
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3958
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3959
|
+
|
3960
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3961
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3962
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3963
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
3964
|
+
|
3965
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
3966
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
|
3967
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3968
|
+
} else {
|
3969
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
|
3970
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3971
|
+
}
|
3972
|
+
}
|
3973
|
+
|
3974
|
+
static void ggml_mul_mat_q3_K_q8_1_cuda(
|
3975
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3976
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3977
|
+
|
3978
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3979
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3980
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3981
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
3982
|
+
|
3983
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
3984
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
|
3985
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3986
|
+
} else {
|
3987
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
|
3988
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3989
|
+
}
|
3990
|
+
}
|
3991
|
+
|
3992
|
+
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
3993
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3994
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3995
|
+
|
3996
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
3997
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
3998
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3999
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
4000
|
+
|
4001
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
4002
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
|
4003
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4004
|
+
} else {
|
4005
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
|
4006
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4007
|
+
}
|
4008
|
+
}
|
4009
|
+
|
4010
|
+
static void ggml_mul_mat_q5_K_q8_1_cuda(
|
4011
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4012
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4013
|
+
|
4014
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
4015
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
4016
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4017
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
4018
|
+
|
4019
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
4020
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
|
4021
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4022
|
+
} else {
|
4023
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
|
4024
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4025
|
+
}
|
4026
|
+
}
|
4027
|
+
|
4028
|
+
static void ggml_mul_mat_q6_K_q8_1_cuda(
|
4029
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4030
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4031
|
+
|
4032
|
+
const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
|
4033
|
+
const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
|
4034
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4035
|
+
const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
|
4036
|
+
|
4037
|
+
if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
|
4038
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
|
4039
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4040
|
+
} else {
|
4041
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
|
4042
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4043
|
+
}
|
4044
|
+
}
|
4045
|
+
|
2503
4046
|
static void ggml_mul_mat_p021_f16_f32_cuda(
|
2504
4047
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
2505
4048
|
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
|
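
For reference, every new ggml_mul_mat_*_q8_1_cuda launcher above uses the same launch geometry: one block per GGML_CUDA_MMQ_Y rows of the quantized matrix and per WARP_SIZE columns of src1, with the bounds-checking load_tiles_*<true> variant selected only when the row count is not a multiple of the tile height. The standalone C++ sketch below (not part of the diff; the constant values are illustrative stand-ins, not taken from ggml-cuda.cu) reproduces just that arithmetic:

    // Host-side sketch of the MMQ launch-geometry arithmetic.
    #include <cstdio>

    constexpr int GGML_CUDA_MMQ_Y = 64; // tile height in dst rows (assumed value)
    constexpr int WARP_SIZE       = 32; // CUDA warp size

    int main() {
        const int nrows_x = 4096; // rows of the quantized weight matrix
        const int ncols_y = 7;    // columns of src1 (batch size)

        // ceil-division: one block per GGML_CUDA_MMQ_Y rows and per WARP_SIZE columns
        const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
        const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;

        // the <true> (bounds-checked) tile loader is only needed when the row count
        // is not a multiple of the tile height; otherwise the cheaper <false> path runs
        const bool need_check = (nrows_x % GGML_CUDA_MMQ_Y) != 0;

        std::printf("grid = (%d, %d), need_check = %s\n",
                    block_num_x, block_num_y, need_check ? "true" : "false");
        return 0;
    }
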
@@ -2544,12 +4087,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 2 == 0);
     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
@@ -2676,10 +4220,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -2701,9 +4244,7 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
-#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
-#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -2965,6 +4506,83 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddq_i != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t i01_diff = i01_high - i01_low;
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
+        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
+    size_t as;
+    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    ggml_cuda_pool_free(src1_q8_1, as);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
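
For reference, ggml_cuda_op_mul_mat_q above pads each src1 row up to a multiple of MATRIX_ROW_PADDING before quantizing it to q8_1 and sizes the temporary pool allocation from that padded length. The standalone C++ sketch below (not part of the diff; the padding granularity and block size are illustrative assumptions) shows only that arithmetic:

    // Host-side sketch of the q8_1 row-padding and pool-size arithmetic.
    #include <cstdint>
    #include <cstdio>

    constexpr int64_t MATRIX_ROW_PADDING = 512;    // assumed padding granularity
    constexpr int64_t QK8_1              = 32;     // values per q8_1 block
    constexpr int64_t BLOCK_Q8_1_BYTES   = 32 + 4; // assumed: 32 int8 quants + two halves

    int main() {
        const int64_t ne10 = 5120; // src1 row length
        const int64_t ne11 = 3;    // number of src1 rows in this slice

        // round ne10 up to the next multiple of MATRIX_ROW_PADDING
        const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        // bytes requested from the device memory pool for the quantized copy of src1
        const int64_t bytes = padded_row_size*ne11*BLOCK_Q8_1_BYTES/QK8_1;

        std::printf("padded_row_size = %lld, pool bytes = %lld\n",
                    (long long) padded_row_size, (long long) bytes);
        return 0;
    }
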
@@ -2979,6 +4597,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
 #ifdef GGML_CUDA_FORCE_DMMV
     const bool use_mul_mat_vec_q = false;
+    (void) g_compute_capabilities[0];
 #else
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -3006,7 +4625,7 @@ inline void ggml_cuda_op_mul_mat_vec(
             ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -3047,7 +4666,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         ggml_cuda_pool_free(src1_q8_1, as);
     } else {
         // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef
+#ifdef GGML_CUDA_F16
         size_t ash;
         dfloat * src1_dfloat = nullptr; // dfloat == half
 
@@ -3063,7 +4682,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         }
 #else
         dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
-#endif //
+#endif // GGML_CUDA_F16
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -3104,11 +4723,11 @@ inline void ggml_cuda_op_mul_mat_vec(
                 break;
         }
 
-#ifdef
+#ifdef GGML_CUDA_F16
         if (src1_convert_f16) {
             ggml_cuda_pool_free(src1_dfloat, ash);
         }
-#endif //
+#endif // GGML_CUDA_F16
     }
 
     (void) src1;
@@ -3168,6 +4787,7 @@ inline void ggml_cuda_op_rope(
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
     const int n_past = ((int32_t *) dst->op_params)[0];
@@ -3181,17 +4801,18 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
-    bool is_glm = mode & 4;
+    const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
+        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
     } else {
-
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }
 
     (void) src1;
@@ -3363,7 +4984,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         int64_t row_low, row_high;
         if (split) {
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-
+            row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % GGML_CUDA_MMQ_Y;
+            }
         } else {
             row_low = 0;
             row_high = nrows0*i02_divisor;
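
For reference, the row-split change above rounds each device's starting row down to a multiple of GGML_CUDA_MMQ_Y so that MMQ tiles never straddle a device boundary, while the last device simply takes whatever rows remain. The standalone C++ sketch below (not part of the diff; the tile height and split fractions are illustrative) replays that split logic:

    // Host-side sketch of the tile-aligned multi-GPU row split.
    #include <cstdio>

    constexpr int GGML_CUDA_MMQ_Y = 64; // assumed tile height in rows

    int main() {
        const long long nrows0 = 10000;
        const float tensor_split[] = {0.0f, 0.4f, 0.75f}; // cumulative split fractions
        const int device_count = 3;

        for (int id = 0; id < device_count; ++id) {
            long long row_low = id == 0 ? 0 : (long long)(nrows0*tensor_split[id]);
            row_low -= row_low % GGML_CUDA_MMQ_Y; // align start to the tile height

            long long row_high;
            if (id == device_count - 1) {
                row_high = nrows0; // last device takes the remainder
            } else {
                row_high = (long long)(nrows0*tensor_split[id + 1]);
                row_high -= row_high % GGML_CUDA_MMQ_Y;
            }
            std::printf("device %d: rows [%lld, %lld)\n", id, row_low, row_high);
        }
        return 0;
    }
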
@@ -3529,13 +5157,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             if (split) {
                 // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                 // dst is NOT transposed.
-                // The outputs of
+                // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                 // Instead they need to be copied to the correct slice in ne0 = dst row index.
                 // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-
-
-
-                }
+                float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                             i01_diff*sizeof(float), ne1, kind, cudaStream_main));
             } else {
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
@@ -3718,7 +5345,18 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
-
+            int min_compute_capability = INT_MAX;
+            for (int id = 0; id < g_device_count; ++id) {
+                if (min_compute_capability > g_compute_capabilities[id]) {
+                    min_compute_capability = g_compute_capabilities[id];
+                }
+            }
+
+            if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+            } else {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+            }
         }
     } else {
         GGML_ASSERT(false);
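
For reference, the dispatch added to ggml_cuda_mul_mat above only takes the new quantized mul_mat_q path when it was requested via ggml_cuda_set_mul_mat_q, src0 is a quantized type, and the lowest compute capability across the visible devices supports dp4a; otherwise it falls back to the cuBLAS path. The standalone C++ sketch below (not part of the diff; MIN_CC_DP4A and the capability values are illustrative stand-ins) mirrors that decision:

    // Host-side sketch of the MMQ-vs-cuBLAS dispatch condition.
    #include <algorithm>
    #include <climits>
    #include <cstdio>

    constexpr int MIN_CC_DP4A = 610; // assumed: dp4a needs compute capability >= 6.1

    int main() {
        const bool g_mul_mat_q  = true;         // as set through ggml_cuda_set_mul_mat_q()
        const bool is_quantized = true;         // stands in for ggml_is_quantized(src0->type)
        const int  capabilities[] = {860, 700}; // 100*major + 10*minor per device

        // the slowest device decides whether the dp4a-based kernels can be used at all
        int min_compute_capability = INT_MAX;
        for (int cc : capabilities) {
            min_compute_capability = std::min(min_compute_capability, cc);
        }

        if (g_mul_mat_q && is_quantized && min_compute_capability >= MIN_CC_DP4A) {
            std::printf("would use the quantized mul_mat_q kernels\n");
        } else {
            std::printf("would fall back to the cuBLAS path\n");
        }
        return 0;
    }
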
@@ -3795,7 +5433,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-
+
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const bool is_glm = mode & 4;
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,7 +5469,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-
+        row_low -= row_low % GGML_CUDA_MMQ_Y;
+
+        if (id == g_device_count - 1) {
+            row_high = nrows;
+        } else {
+            row_high = nrows*g_tensor_split[id + 1];
+            row_high -= row_high % GGML_CUDA_MMQ_Y;
+        }
     } else {
         GGML_ASSERT(false);
     }
@@ -4002,6 +5650,10 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }
 
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+    g_mul_mat_q = mul_mat_q;
+}
+
 void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }