llama_cpp 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
@@ -14,6 +14,7 @@
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -52,13 +53,41 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif // GGML_CUDA_DMMV_F16
+#endif //GGML_CUDA_F16
+
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] << 0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}

 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
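The get_int_from_* helpers added above repack four consecutive 8-bit quants into a single 32-bit integer, which is the operand shape the __dp4a-based kernels introduced later in this diff expect. Below is a minimal host-side reference of the same packing; it is illustrative only (little-endian assumed) and is not part of the gem.

```cpp
// Host-side reference for the byte packing done by the get_int_from_* device helpers:
// four consecutive 8-bit quants become one 32-bit int suitable for __dp4a.
#include <cstdint>
#include <cstring>
#include <cassert>

static int get_int_from_uint8_ref(const uint8_t * x8, int i32) {
    int x32 = 0;
    std::memcpy(&x32, x8 + sizeof(int) * i32, sizeof(int)); // byte-wise copy, no alignment assumption
    return x32;
}

int main() {
    const uint8_t qs[8] = {0x01, 0x02, 0x03, 0x04, 0x11, 0x12, 0x13, 0x14};
    assert(get_int_from_uint8_ref(qs, 0) == 0x04030201); // little-endian packing
    assert(get_int_from_uint8_ref(qs, 1) == 0x14131211);
    return 0;
}
```

The device versions read two 16-bit halves instead of one 32-bit word because, as their comments note, they only assume 2-byte alignment of the quant data.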
@@ -87,8 +116,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QR4_1 2
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
-    half d;  // delta
-    half m;  // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -107,8 +135,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QR5_1 2
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
-    half d;  // delta
-    half m;  // min
+    half2 dm; // dm.x = delta, dm.y = min
     uint8_t qh[4]; // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -127,13 +154,19 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QR8_1 1
 #define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
-    half d;  // delta
-    half s;  // unquantized sum
+    half2 ds; // ds.x = delta, ds.y = sum
     int8_t qs[QK8_0]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
+typedef void (*load_tiles_cuda_t)(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);

 //================================= k-quants

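block_q4_1, block_q5_1 and block_q8_1 above replace their two scalar half fields with a single packed half2, so a kernel can fetch delta and min (or delta and sum) with one 32-bit load; the static_asserts are unchanged because the block size stays the same. A compile-time sketch of that layout argument follows; the struct name is hypothetical and the real QK4_1 and block_q4_1 live in ggml-cuda.cu / ggml.c.

```cpp
// Layout sketch for the half2 packing used by the reworked block_q4_1.
#include <cuda_fp16.h>
#include <cstdint>

#define QK4_1 32

typedef struct {
    half2   dm;             // dm.x = delta, dm.y = min (was: half d; half m;)
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
} block_q4_1_sketch;

// Two fp16 scalars and one half2 both occupy 4 bytes, so the block layout is unchanged.
static_assert(sizeof(half2) == 2 * sizeof(half), "half2 must pack two halves");
static_assert(sizeof(block_q4_1_sketch) == 2 * sizeof(half) + QK4_1 / 2,
              "wrong q4_1 block size/padding");
```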
@@ -150,8 +183,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4]; // quants
-    half d;    // super-block scale for quantized scales
-    half dmin; // super-block scale for quantized mins
+    half2 dm;  // super-block scale for quantized scales/mins
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

@@ -180,8 +212,7 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
-    half d;    // super-block scale for quantized scales
-    half dmin; // super-block scale for quantized mins
+    half2 dm;  // super-block scale for quantized scales/mins
     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
@@ -200,11 +231,10 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
 #else
 typedef struct {
-    half d;    // super-block scale for quantized scales
-    half dmin; // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8]; // quants, high bit
-    uint8_t qs[QK_K/2]; // quants, low 4 bits
+    half2 dm;  // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
@@ -252,6 +282,20 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -367,33 +411,33 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
     v.x = vui & 0xF;
     v.y = vui >> 4;

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {8.0f, 8.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 8.0f) * d;
     v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_DMMV_F16
+#endif // GGML_CUDA_F16
 }

 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

-    const dfloat d = x[ib].d;
-    const dfloat m = x[ib].m;
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;

     const int vui = x[ib].qs[iqs];

     v.x = vui & 0xF;
     v.y = vui >> 4;

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif // GGML_CUDA_DMMV_F16
+#endif // GGML_CUDA_F16
 }

 static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
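dequantize_q4_1 above now reads delta and min from the packed dm field; the arithmetic is unchanged: each byte of qs holds two 4-bit quants and each value is reconstructed as q * delta + min. A scalar reference of that step, using a hypothetical helper and plain floats (the real kernel uses dfloat/dfloat2 and the GGML_CUDA_F16 path):

```cpp
// Scalar reference for the q4_1 dequantization shown above.
#include <cstdint>

struct dm_pair { float delta; float min; };  // stands in for the half2 dm field

static void dequantize_q4_1_ref(const uint8_t * qs, dm_pair dm, int iqs,
                                float & v0, float & v1) {
    const int vui = qs[iqs];
    v0 = (vui & 0xF) * dm.delta + dm.min;  // low nibble
    v1 = (vui >>  4) * dm.delta + dm.min;  // high nibble
}
```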
@@ -410,20 +454,20 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
     v = __hsub2(v, {16.0f, 16.0f});
     v = __hmul2(v, {d, d});
 #else
     v.x = (v.x - 16.0f) * d;
     v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_DMMV_F16
+#endif // GGML_CUDA_F16
 }

 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

-    const dfloat d = x[ib].d;
-    const dfloat m = x[ib].m;
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -434,13 +478,13 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
     v = __hadd2(v, {m, m});
 #else
     v.x = (v.x * d) + m;
     v.y = (v.y * d) + m;
-#endif // GGML_CUDA_DMMV_F16
+#endif // GGML_CUDA_F16
 }

 static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
@@ -451,12 +495,12 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
     v.x = x[ib].qs[iqs + 0];
     v.y = x[ib].qs[iqs + 1];

-#ifdef GGML_CUDA_DMMV_F16
+#ifdef GGML_CUDA_F16
     v = __hmul2(v, {d, d});
 #else
     v.x *= d;
     v.y *= d;
-#endif // GGML_CUDA_DMMV_F16
+#endif // GGML_CUDA_F16
 }

 //================================== k-quants
@@ -475,8 +519,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;

-    float dall = x[i].d;
-    float dmin = x[i].dmin;
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -486,8 +530,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].d;
-    float dmin = x[i].dmin;
+    float dall = x[i].dm.x;
+    float dmin = x[i].dm.y;
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -573,8 +617,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

     float * y = yy + i*QK_K + 64*il + n*ir;

-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;

     const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -612,8 +656,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

     float * y = yy + i*QK_K + 64*il + 2*ir;

-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;

     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -725,8 +769,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     const float * y = yy + i * QK_K + y_offset;
     const uint8_t * q = x[i].qs + q_offset;

-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;

     const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
     aux[0] = a[0] & 0x0f0f0f0f;
@@ -768,9 +812,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     uaux[0] = s[0] & 0x0f0f0f0f;
     uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;

-    const half2 * dh = (const half2 *)&x[i].d;
-
-    const float2 dall = __half22float2(dh[0]);
+    const float2 dall = __half22float2(x[i].dm);

     float sum1 = 0, sum2 = 0;
     for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
@@ -948,8 +990,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;

-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;

     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1081,8 +1123,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;

-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
+    const float dall = x[i].dm.x;
+    const float dmin = x[i].dm.y;

     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1270,19 +1312,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+    const int ix = blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (ix >= kx_padded) {
         return;
     }

+    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int i_padded = iy*kx_padded + ix;
+
     block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i / QK8_1; // block index
-    const int iqs = i % QK8_1; // quant index
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index

-    const float xi = i < ndata ? x[i] : 0.0f;
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;

@@ -1301,8 +1347,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }

-    y[ib].d = d;
-    y[ib].s = sum;
+    y[ib].ds.x = d;
+    y[ib].ds.y = sum;
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1326,485 +1372,1876 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
|
|
1326
1372
|
y[iybs + iqs + y_offset] = v.y;
|
1327
1373
|
}
|
1328
1374
|
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1375
|
+
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
1376
|
+
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
1377
|
+
|
1378
|
+
#define VDR_Q4_0_Q8_1_MMVQ 2
|
1379
|
+
#define VDR_Q4_0_Q8_1_MMQ 4
|
1380
|
+
|
1381
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
|
1382
|
+
const int * v, const int * u, const float & d4, const half2 & ds8) {
|
1333
1383
|
|
1334
|
-
|
1335
|
-
|
1336
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1337
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
|
1384
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1385
|
+
int sumi = 0;
|
1338
1386
|
|
1339
|
-
|
1387
|
+
#pragma unroll
|
1388
|
+
for (int i = 0; i < vdr; ++i) {
|
1389
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1390
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1340
1391
|
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1392
|
+
// SIMD dot product of quantized values
|
1393
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1394
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1395
|
+
}
|
1344
1396
|
|
1345
|
-
|
1346
|
-
int sumi = __dp4a(vi0, ui0, 0);
|
1347
|
-
sumi = __dp4a(vi1, ui1, sumi);
|
1397
|
+
const float2 ds8f = __half22float2(ds8);
|
1348
1398
|
|
1349
|
-
|
1399
|
+
// second part effectively subtracts 8 from each quant value
|
1400
|
+
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
1350
1401
|
#else
|
1351
1402
|
return 0.0f; // only to satisfy the compiler
|
1352
1403
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1353
1404
|
}
|
1354
1405
|
|
1355
|
-
|
1356
|
-
|
1357
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1358
|
-
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1406
|
+
#define VDR_Q4_1_Q8_1_MMVQ 2
|
1407
|
+
#define VDR_Q4_1_Q8_1_MMQ 4
|
1359
1408
|
|
1360
|
-
|
1361
|
-
const int
|
1362
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
|
1409
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
|
1410
|
+
const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
|
1363
1411
|
|
1364
|
-
|
1365
|
-
|
1366
|
-
const float s = bq8_1->s;
|
1412
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1413
|
+
int sumi = 0;
|
1367
1414
|
|
1368
|
-
|
1369
|
-
|
1415
|
+
#pragma unroll
|
1416
|
+
for (int i = 0; i < vdr; ++i) {
|
1417
|
+
const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
|
1418
|
+
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
|
1370
1419
|
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1420
|
+
// SIMD dot product of quantized values
|
1421
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi);
|
1422
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1423
|
+
}
|
1374
1424
|
|
1375
|
-
|
1425
|
+
#ifdef GGML_CUDA_F16
|
1426
|
+
const float2 tmp = __half22float2(__hmul2(dm4, ds8));
|
1427
|
+
const float d4d8 = tmp.x;
|
1428
|
+
const float m4s8 = tmp.y;
|
1429
|
+
#else
|
1430
|
+
const float2 dm4f = __half22float2(dm4);
|
1431
|
+
const float2 ds8f = __half22float2(ds8);
|
1432
|
+
const float d4d8 = dm4f.x * ds8f.x;
|
1433
|
+
const float m4s8 = dm4f.y * ds8f.y;
|
1434
|
+
#endif // GGML_CUDA_F16
|
1435
|
+
|
1436
|
+
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1437
|
+
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1376
1438
|
#else
|
1377
1439
|
return 0.0f; // only to satisfy the compiler
|
1378
1440
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1379
1441
|
}
|
1380
1442
|
|
1381
|
-
|
1382
|
-
|
1443
|
+
#define VDR_Q5_0_Q8_1_MMVQ 2
|
1444
|
+
#define VDR_Q5_0_Q8_1_MMQ 4
|
1445
|
+
|
1446
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
|
1447
|
+
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
|
1448
|
+
|
1383
1449
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1384
|
-
|
1450
|
+
int sumi = 0;
|
1451
|
+
|
1452
|
+
#pragma unroll
|
1453
|
+
for (int i = 0; i < vdr; ++i) {
|
1454
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1455
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1456
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1457
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1458
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1459
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1460
|
+
|
1461
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1462
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1463
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1464
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1465
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1466
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1467
|
+
}
|
1468
|
+
|
1469
|
+
const float2 ds8f = __half22float2(ds8);
|
1385
1470
|
|
1386
|
-
|
1387
|
-
|
1388
|
-
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1389
|
-
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1390
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1391
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
|
1392
|
-
|
1393
|
-
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
|
1394
|
-
|
1395
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1396
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1397
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1398
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1399
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1400
|
-
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
|
1401
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1402
|
-
|
1403
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1404
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1405
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1406
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1407
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1408
|
-
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
|
1409
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1410
|
-
|
1411
|
-
return sumi*d;
|
1471
|
+
// second part effectively subtracts 16 from each quant value
|
1472
|
+
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
1412
1473
|
#else
|
1413
1474
|
return 0.0f; // only to satisfy the compiler
|
1414
1475
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1415
1476
|
}
|
1416
1477
|
|
1417
|
-
|
1418
|
-
|
1478
|
+
#define VDR_Q5_1_Q8_1_MMVQ 2
|
1479
|
+
#define VDR_Q5_1_Q8_1_MMQ 4
|
1480
|
+
|
1481
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
|
1482
|
+
const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
|
1483
|
+
|
1419
1484
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1420
|
-
|
1485
|
+
int sumi = 0;
|
1486
|
+
|
1487
|
+
#pragma unroll
|
1488
|
+
for (int i = 0; i < vdr; ++i) {
|
1489
|
+
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1490
|
+
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
1491
|
+
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
|
1492
|
+
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
|
1493
|
+
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
|
1494
|
+
sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
|
1495
|
+
|
1496
|
+
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
|
1497
|
+
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
|
1498
|
+
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
|
1499
|
+
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
|
1500
|
+
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
|
1501
|
+
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1502
|
+
}
|
1503
|
+
|
1504
|
+
#ifdef GGML_CUDA_F16
|
1505
|
+
const float2 tmp = __half22float2(__hmul2(dm5, ds8));
|
1506
|
+
const float d5d8 = tmp.x;
|
1507
|
+
const float m5s8 = tmp.y;
|
1508
|
+
#else
|
1509
|
+
const float2 dm5f = __half22float2(dm5);
|
1510
|
+
const float2 ds8f = __half22float2(ds8);
|
1511
|
+
const float d5d8 = dm5f.x * ds8f.x;
|
1512
|
+
const float m5s8 = dm5f.y * ds8f.y;
|
1513
|
+
#endif // GGML_CUDA_F16
|
1514
|
+
|
1515
|
+
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
1516
|
+
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1421
1517
|
|
1422
|
-
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
|
1423
|
-
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
|
1424
|
-
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
|
1425
|
-
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1426
|
-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
|
1427
|
-
|
1428
|
-
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
|
1429
|
-
const float m = bq5_1->m;
|
1430
|
-
const float s = bq8_1->s;
|
1431
|
-
|
1432
|
-
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
|
1433
|
-
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
|
1434
|
-
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
|
1435
|
-
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
|
1436
|
-
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
|
1437
|
-
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
|
1438
|
-
|
1439
|
-
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
|
1440
|
-
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
|
1441
|
-
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
|
1442
|
-
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
|
1443
|
-
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
|
1444
|
-
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
|
1445
|
-
|
1446
|
-
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
|
1447
1518
|
#else
|
1448
1519
|
return 0.0f; // only to satisfy the compiler
|
1449
1520
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1450
1521
|
}
|
1451
1522
|
|
1452
|
-
|
1453
|
-
|
1454
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1455
|
-
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
1523
|
+
#define VDR_Q8_0_Q8_1_MMVQ 2
|
1524
|
+
#define VDR_Q8_0_Q8_1_MMQ 8
|
1456
1525
|
|
1457
|
-
|
1458
|
-
|
1459
|
-
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
|
1526
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
|
1527
|
+
const int * v, const int * u, const float & d8_0, const float & d8_1) {
|
1460
1528
|
|
1461
|
-
|
1529
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1530
|
+
int sumi = 0;
|
1462
1531
|
|
1463
|
-
|
1464
|
-
int
|
1532
|
+
#pragma unroll
|
1533
|
+
for (int i = 0; i < vdr; ++i) {
|
1534
|
+
// SIMD dot product of quantized values
|
1535
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1536
|
+
}
|
1465
1537
|
|
1466
|
-
return sumi
|
1538
|
+
return d8_0*d8_1 * sumi;
|
1467
1539
|
#else
|
1468
1540
|
return 0.0f; // only to satisfy the compiler
|
1469
1541
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1470
1542
|
}
|
1471
1543
|
|
1472
|
-
static __device__ __forceinline__ float
|
1473
|
-
const
|
1544
|
+
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
|
1545
|
+
const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
|
1474
1546
|
|
1475
1547
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1476
|
-
|
1548
|
+
int sumi = 0;
|
1477
1549
|
|
1478
|
-
|
1479
|
-
|
1550
|
+
#pragma unroll
|
1551
|
+
for (int i = 0; i < vdr; ++i) {
|
1552
|
+
// SIMD dot product of quantized values
|
1553
|
+
sumi = __dp4a(v[i], u[i], sumi);
|
1554
|
+
}
|
1480
1555
|
|
1481
|
-
|
1482
|
-
|
1556
|
+
#ifdef GGML_CUDA_F16
|
1557
|
+
const float2 tmp = __half22float2(__hmul2(dm8, ds8));
|
1558
|
+
const float d8d8 = tmp.x;
|
1559
|
+
const float m8s8 = tmp.y;
|
1560
|
+
#else
|
1561
|
+
const float2 dm8f = __half22float2(dm8);
|
1562
|
+
const float2 ds8f = __half22float2(ds8);
|
1563
|
+
const float d8d8 = dm8f.x * ds8f.x;
|
1564
|
+
const float m8s8 = dm8f.y * ds8f.y;
|
1565
|
+
#endif // GGML_CUDA_F16
|
1566
|
+
|
1567
|
+
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1568
|
+
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1569
|
+
#else
|
1570
|
+
return 0.0f; // only to satisfy the compiler
|
1571
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1572
|
+
}
|
1483
1573
|
|
1484
|
-
|
1485
|
-
|
1574
|
+
#define VDR_Q2_K_Q8_1_MMVQ 1
|
1575
|
+
#define VDR_Q2_K_Q8_1_MMQ 2
|
1486
1576
|
|
1487
|
-
|
1577
|
+
// contiguous v/x values
|
1578
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
1579
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1580
|
+
const half2 & dm2, const float * __restrict__ d8) {
|
1488
1581
|
|
1489
|
-
|
1490
|
-
|
1582
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1583
|
+
float sumf_d = 0.0f;
|
1584
|
+
float sumf_m = 0.0f;
|
1491
1585
|
|
1492
|
-
|
1493
|
-
|
1586
|
+
#pragma unroll
|
1587
|
+
for (int i = 0; i < QR2_K; ++i) {
|
1588
|
+
const int sc = scales[2*i];
|
1494
1589
|
|
1495
1590
|
const int vi = (v >> (2*i)) & 0x03030303;
|
1496
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1497
1591
|
|
1498
|
-
sumf_d +=
|
1499
|
-
|
1592
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
1593
|
+
|
1594
|
+
// fill int with 4x m
|
1595
|
+
int m = sc >> 4;
|
1596
|
+
m |= m << 8;
|
1597
|
+
m |= m << 16;
|
1598
|
+
sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
1500
1599
|
}
|
1501
1600
|
|
1502
|
-
|
1601
|
+
const float2 dm2f = __half22float2(dm2);
|
1602
|
+
|
1603
|
+
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
1503
1604
|
#else
|
1504
1605
|
return 0.0f; // only to satisfy the compiler
|
1505
1606
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1506
1607
|
}
|
1507
1608
|
|
1508
|
-
|
1509
|
-
|
1609
|
+
// contiguous u/y values
|
1610
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
1611
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1612
|
+
const half2 & dm2, const float & d8) {
|
1510
1613
|
|
1511
1614
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1512
|
-
|
1615
|
+
int sumi_d = 0;
|
1616
|
+
int sumi_m = 0;
|
1513
1617
|
|
1514
|
-
|
1515
|
-
|
1618
|
+
#pragma unroll
|
1619
|
+
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
|
1620
|
+
int sumi_d_sc = 0;
|
1516
1621
|
|
1517
|
-
|
1622
|
+
const int sc = scales[i0 / (QI8_1/2)];
|
1518
1623
|
|
1519
|
-
|
1624
|
+
// fill int with 4x m
|
1625
|
+
int m = sc >> 4;
|
1626
|
+
m |= m << 8;
|
1627
|
+
m |= m << 16;
|
1628
|
+
|
1629
|
+
#pragma unroll
|
1630
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1631
|
+
sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
|
1632
|
+
sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
|
1633
|
+
}
|
1634
|
+
|
1635
|
+
sumi_d += sumi_d_sc * (sc & 0xF);
|
1636
|
+
}
|
1637
|
+
|
1638
|
+
const float2 dm2f = __half22float2(dm2);
|
1639
|
+
|
1640
|
+
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
1641
|
+
#else
|
1642
|
+
return 0.0f; // only to satisfy the compiler
|
1643
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1644
|
+
}
|
1520
1645
|
|
1521
|
-
|
1522
|
-
|
1646
|
+
#define VDR_Q3_K_Q8_1_MMVQ 1
|
1647
|
+
#define VDR_Q3_K_Q8_1_MMQ 2
|
1523
1648
|
|
1524
|
-
|
1525
|
-
|
1526
|
-
|
1527
|
-
|
1649
|
+
// contiguous v/x values
|
1650
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
1651
|
+
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1652
|
+
const int & scale_offset, const float & d3, const float * __restrict__ d8) {
|
1653
|
+
|
1654
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1655
|
+
float sumf = 0.0f;
|
1528
1656
|
|
1657
|
+
#pragma unroll
|
1529
1658
|
for (int i = 0; i < QR3_K; ++i) {
|
1530
1659
|
const int isc = scale_offset + 2*i;
|
1531
1660
|
|
1532
1661
|
const int isc_low = isc % (QK_K/32);
|
1533
1662
|
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
1534
|
-
const int sc_low = (
|
1663
|
+
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
1535
1664
|
|
1536
1665
|
const int isc_high = isc % (QK_K/64);
|
1537
1666
|
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
1538
|
-
const int sc_high = ((
|
1667
|
+
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
1539
1668
|
|
1540
1669
|
const int sc = (sc_low | sc_high) - 32;
|
1541
1670
|
|
1542
|
-
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
1543
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
|
1544
|
-
const float d8i = bq8i->d;
|
1545
|
-
|
1546
1671
|
const int vil = (vl >> (2*i)) & 0x03030303;
|
1547
1672
|
|
1548
1673
|
const int vih = ((vh >> i) << 2) & 0x04040404;
|
1549
1674
|
|
1550
1675
|
const int vi = __vsubss4(vil, vih);
|
1551
1676
|
|
1552
|
-
sumf +=
|
1677
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1553
1678
|
}
|
1554
1679
|
|
1555
|
-
return
|
1680
|
+
return d3 * sumf;
|
1556
1681
|
#else
|
1557
1682
|
return 0.0f; // only to satisfy the compiler
|
1558
1683
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1559
1684
|
}
|
1560
1685
|
|
1561
|
-
|
1562
|
-
|
1686
|
+
// contiguous u/y values
|
1687
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
1688
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1689
|
+
const float & d3, const float & d8) {
|
1563
1690
|
|
1564
1691
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1565
|
-
|
1692
|
+
int sumi = 0;
|
1566
1693
|
|
1567
|
-
|
1568
|
-
|
1694
|
+
#pragma unroll
|
1695
|
+
for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
|
1696
|
+
int sumi_sc = 0;
|
1569
1697
|
|
1570
|
-
|
1698
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1699
|
+
sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
|
1700
|
+
}
|
1571
1701
|
|
1572
|
-
|
1573
|
-
|
1702
|
+
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
1703
|
+
}
|
1574
1704
|
|
1575
|
-
|
1576
|
-
|
1705
|
+
return d3*d8 * sumi;
|
1706
|
+
#else
|
1707
|
+
return 0.0f; // only to satisfy the compiler
|
1708
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1709
|
+
}
|
1577
1710
|
|
1578
|
-
|
1579
|
-
|
1580
|
-
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
1581
|
-
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
1711
|
+
#define VDR_Q4_K_Q8_1_MMVQ 2
|
1712
|
+
#define VDR_Q4_K_Q8_1_MMQ 8
|
1582
1713
|
|
1583
|
-
|
1584
|
-
|
1585
|
-
const int
|
1714
|
+
// contiguous v/x values
|
1715
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
1716
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1717
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
1586
1718
|
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
if (j < 2) {
|
1591
|
-
aux[0] = scales[j+0] & 0x3f3f;
|
1592
|
-
aux[1] = scales[j+2] & 0x3f3f;
|
1593
|
-
} else {
|
1594
|
-
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
1595
|
-
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
1596
|
-
}
|
1597
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
1598
|
-
const uint8_t * m = sc + 2;
|
1719
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1720
|
+
float sumf_d = 0.0f;
|
1721
|
+
float sumf_m = 0.0f;
|
1599
1722
|
|
1723
|
+
#pragma unroll
|
1600
1724
|
for (int i = 0; i < QR4_K; ++i) {
|
1725
|
+
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
1726
|
+
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
1601
1727
|
|
1602
|
-
const
|
1603
|
-
const
|
1604
|
-
const int * q8 = (const int *)bq8i->qs + (iqs%4);
|
1605
|
-
const int ui1 = q8[0];
|
1606
|
-
const int ui2 = q8[4];
|
1607
|
-
|
1608
|
-
const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
|
1609
|
-
const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
|
1610
|
-
|
1611
|
-
const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
|
1612
|
-
const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
1728
|
+
const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
|
1729
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
|
1613
1730
|
|
1614
|
-
sumf_d +=
|
1615
|
-
sumf_m +=
|
1731
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1732
|
+
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
1616
1733
|
}
|
1617
1734
|
|
1618
|
-
|
1619
|
-
|
1620
|
-
#else
|
1735
|
+
const float2 dm4f = __half22float2(dm4);
|
1621
1736
|
|
1622
|
-
|
1623
|
-
const uint8_t * s = (const uint8_t *)aux16;
|
1737
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1624
1738
|
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1739
|
+
#else
|
1740
|
+
return 0.0f; // only to satisfy the compiler
|
1741
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1742
|
+
}
|
1628
1743
|
|
1629
|
-
|
1630
|
-
|
1744
|
+
// contiguous u/y values
|
1745
|
+
// also used for q5_K
|
1746
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
1747
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1748
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
1631
1749
|
|
1632
|
-
|
1633
|
-
|
1750
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1751
|
+
float sumf_d = 0.0f;
|
1752
|
+
float sumf_m = 0.0f;
|
1634
1753
|
|
1635
|
-
|
1636
|
-
|
1637
|
-
|
1638
|
-
const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
|
1754
|
+
#pragma unroll
|
1755
|
+
for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
|
1756
|
+
int sumi_d = 0;
|
1639
1757
|
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1758
|
+
#pragma unroll
|
1759
|
+
for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
|
1760
|
+
sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
|
1761
|
+
sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
|
1762
|
+
}
|
1643
1763
|
|
1644
|
-
|
1645
|
-
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
1646
|
-
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
1647
|
-
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
1764
|
+
const float2 ds8f = __half22float2(ds8[i0 / 4]);
|
1648
1765
|
|
1649
|
-
|
1650
|
-
|
1766
|
+
sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
|
1767
|
+
sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
|
1768
|
+
}
|
1651
1769
|
|
1652
|
-
|
1770
|
+
const float2 dm4f = __half22float2(dm4);
|
1653
1771
|
|
1654
|
-
|
1772
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1655
1773
|
|
1656
1774
|
#else
|
1657
1775
|
return 0.0f; // only to satisfy the compiler
|
1658
1776
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1659
1777
|
}
|
1660
1778
|
|
1661
|
-
|
1662
|
-
|
1663
|
-
|
1664
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1665
|
-
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
1666
|
-
|
1667
|
-
#ifndef GGML_QKK_64
|
1779
|
+
#define VDR_Q5_K_Q8_1_MMVQ 2
|
1780
|
+
#define VDR_Q5_K_Q8_1_MMQ 8
|
1668
1781
|
|
1669
|
-
|
1670
|
-
|
1671
|
-
const int *
|
1782
|
+
// contiguous v/x values
|
1783
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
|
1784
|
+
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1785
|
+
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
1672
1786
|
|
1787
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1673
1788
|
float sumf_d = 0.0f;
|
1674
1789
|
float sumf_m = 0.0f;
|
1675
1790
|
|
1676
|
-
|
1677
|
-
|
1791
|
+
#pragma unroll
|
1792
|
+
for (int i = 0; i < QR5_K; ++i) {
|
1793
|
+
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
1794
|
+
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
1678
1795
|
|
1679
|
-
|
1680
|
-
|
1796
|
+
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
1797
|
+
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
1681
1798
|
|
1682
|
-
|
1683
|
-
|
1799
|
+
const int v0i = vl0i | vh0i;
|
1800
|
+
const int v1i = vl1i | vh1i;
|
1684
1801
|
|
1685
|
-
|
1686
|
-
|
1687
|
-
const int j = bq8_offset/2;
|
1688
|
-
if (j < 2) {
|
1689
|
-
aux[0] = scales[j+0] & 0x3f3f;
|
1690
|
-
aux[1] = scales[j+2] & 0x3f3f;
|
1691
|
-
} else {
|
1692
|
-
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
1693
|
-
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
1694
|
-
}
|
1695
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
1696
|
-
const uint8_t * m = sc + 2;
|
1802
|
+
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
1803
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
1697
1804
|
|
1698
|
-
|
1805
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1806
|
+
sumf_m += d8[i] * (dot2 * m[i]);
|
1699
1807
|
|
1700
|
-
|
1701
|
-
const float d8i = bq8i->d;
|
1702
|
-
const int * q8 = (const int *)bq8i->qs + (iqs%4);
|
1703
|
-
const int ui1 = q8[0];
|
1704
|
-
const int ui2 = q8[4];
|
1808
|
+
}
|
1705
1809
|
|
1706
|
-
|
1707
|
-
const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
|
1810
|
+
const float2 dm5f = __half22float2(dm5);
|
1708
1811
|
|
1709
|
-
|
1710
|
-
const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
|
1812
|
+
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
1711
1813
|
|
1712
|
-
|
1713
|
-
|
1814
|
+
#else
|
1815
|
+
return 0.0f; // only to satisfy the compiler
|
1816
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1817
|
+
}
|
1714
1818
|
|
1715
|
-
|
1716
|
-
|
1819
|
+
#define VDR_Q6_K_Q8_1_MMVQ 1
|
1820
|
+
#define VDR_Q6_K_Q8_1_MMQ 8
|
1717
1821
|
|
1718
|
-
|
1719
|
-
|
1822
|
+
// contiguous v/x values
|
1823
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
1824
|
+
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1825
|
+
const float & d, const float * __restrict__ d8) {
|
1720
1826
|
|
1721
|
-
|
1827
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1828
|
+
float sumf = 0.0f;
|
1722
1829
|
|
1723
|
-
|
1830
|
+
#pragma unroll
|
1831
|
+
for (int i = 0; i < QR6_K; ++i) {
|
1832
|
+
const int sc = scales[4*i];
|
1724
1833
|
|
1725
|
-
|
1834
|
+
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
1726
1835
|
|
1727
|
-
|
1836
|
+
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
1728
1837
|
|
1729
|
-
|
1838
|
+
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
1730
1839
|
|
1731
|
-
|
1732
|
-
|
1840
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1841
|
+
}
|
1733
1842
|
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1843
|
+
return d*sumf;
|
1844
|
+
#else
|
1845
|
+
return 0.0f; // only to satisfy the compiler
|
1846
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1847
|
+
}
|
1738
1848
|
|
1739
|
-
|
1740
|
-
|
1741
|
-
const int
|
1849
|
+
// contiguous u/y values
|
1850
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
1851
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
1852
|
+
const float & d6, const float * __restrict__ d8) {
|
1742
1853
|
|
1743
|
-
|
1744
|
-
|
1745
|
-
const int in = step%8; // 0, 4, 0, 4
|
1746
|
-
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
1854
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1855
|
+
float sumf_d = 0.0f;
|
1747
1856
|
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
1857
|
+
#pragma unroll
|
1858
|
+
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
|
1859
|
+
int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
|
1752
1860
|
|
1753
|
-
|
1754
|
-
|
1861
|
+
#pragma unroll
|
1862
|
+
for (int i = i0; i < i0 + 2; ++i) {
|
1863
|
+
sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
|
1864
|
+
sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
|
1755
1865
|
|
1756
|
-
|
1866
|
+
sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
|
1867
|
+
sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
|
1868
|
+
}
|
1757
1869
|
|
1758
|
-
|
1870
|
+
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
1871
|
+
}
|
1872
|
+
|
1873
|
+
return d6 * sumf_d;
|
1759
1874
|
|
1760
1875
|
#else
|
1761
1876
|
return 0.0f; // only to satisfy the compiler
|
1762
1877
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1763
1878
|
}
|
1764
1879
|
|
1765
|
-
static __device__ __forceinline__ float
|
1766
|
-
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
|
1880
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
1881
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1882
|
+
|
1883
|
+
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
|
1884
|
+
|
1885
|
+
int v[VDR_Q4_0_Q8_1_MMVQ];
|
1886
|
+
int u[2*VDR_Q4_0_Q8_1_MMVQ];
|
1887
|
+
|
1888
|
+
#pragma unroll
|
1889
|
+
for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
|
1890
|
+
v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
|
1891
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1892
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
|
1893
|
+
}
|
1894
|
+
|
1895
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1896
|
+
}
|
1897
|
+
|
1898
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1899
|
+
|
1900
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
1901
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
|
1902
|
+
|
1903
|
+
*x_ql = tile_x_qs;
|
1904
|
+
*x_dm = (half2 *) tile_x_d;
|
1905
|
+
}
|
1906
|
+
|
1907
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1908
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1909
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1910
|
+
|
1911
|
+
__builtin_assume(i_offset >= 0);
|
1912
|
+
__builtin_assume(i_offset < nwarps);
|
1913
|
+
__builtin_assume(k >= 0);
|
1914
|
+
__builtin_assume(k < WARP_SIZE);
|
1915
|
+
|
1916
|
+
const int kbx = k / QI4_0;
|
1917
|
+
const int kqsx = k % QI4_0;
|
1918
|
+
|
1919
|
+
const block_q4_0 * bx0 = (block_q4_0 *) vx;
|
1920
|
+
|
1921
|
+
float * x_dmf = (float *) x_dm;
|
1922
|
+
|
1923
|
+
#pragma unroll
|
1924
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1925
|
+
int i = i0 + i_offset;
|
1926
|
+
|
1927
|
+
if (need_check) {
|
1928
|
+
i = min(i, i_max);
|
1929
|
+
}
|
1930
|
+
|
1931
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1932
|
+
|
1933
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1934
|
+
// x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1935
|
+
}
|
1936
|
+
|
1937
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1938
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1939
|
+
|
1940
|
+
#pragma unroll
|
1941
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
|
1942
|
+
int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1943
|
+
|
1944
|
+
if (need_check) {
|
1945
|
+
i = min(i, i_max);
|
1946
|
+
}
|
1947
|
+
|
1948
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1949
|
+
|
1950
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
|
1951
|
+
}
|
1952
|
+
}
|
1953
|
+
|
1954
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1955
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1956
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1957
|
+
|
1958
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1959
|
+
const float * x_dmf = (float *) x_dm;
|
1960
|
+
|
1961
|
+
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
1962
|
+
|
1963
|
+
#pragma unroll
|
1964
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1965
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
1966
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
|
1967
|
+
}
|
1968
|
+
|
1969
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1970
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1971
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1972
|
+
}
|
1973
|
+
|
1974
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
1975
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1976
|
+
|
1977
|
+
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
|
1978
|
+
|
1979
|
+
int v[VDR_Q4_1_Q8_1_MMVQ];
|
1980
|
+
int u[2*VDR_Q4_1_Q8_1_MMVQ];
|
1981
|
+
|
1982
|
+
#pragma unroll
|
1983
|
+
for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
|
1984
|
+
v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
|
1985
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1986
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
|
1987
|
+
}
|
1988
|
+
|
1989
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1990
|
+
}
|
1991
|
+
|
1992
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1993
|
+
|
1994
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
|
1995
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
|
1996
|
+
|
1997
|
+
*x_ql = tile_x_qs;
|
1998
|
+
*x_dm = tile_x_dm;
|
1999
|
+
}
|
2000
|
+
|
2001
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
2002
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2003
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2004
|
+
|
2005
|
+
__builtin_assume(i_offset >= 0);
|
2006
|
+
__builtin_assume(i_offset < nwarps);
|
2007
|
+
__builtin_assume(k >= 0);
|
2008
|
+
__builtin_assume(k < WARP_SIZE);
|
2009
|
+
|
2010
|
+
const int kbx = k / QI4_1;
|
2011
|
+
const int kqsx = k % QI4_1;
|
2012
|
+
|
2013
|
+
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
2014
|
+
|
2015
|
+
#pragma unroll
|
2016
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2017
|
+
int i = i0 + i_offset;
|
2018
|
+
|
2019
|
+
if (need_check) {
|
2020
|
+
i = min(i, i_max);
|
2021
|
+
}
|
2022
|
+
|
2023
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
2024
|
+
|
2025
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2026
|
+
}
|
2027
|
+
|
2028
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
|
2029
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2030
|
+
|
2031
|
+
#pragma unroll
|
2032
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
|
2033
|
+
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
2034
|
+
|
2035
|
+
if (need_check) {
|
2036
|
+
i = min(i, i_max);
|
2037
|
+
}
|
2038
|
+
|
2039
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2040
|
+
|
2041
|
+
x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
|
2042
|
+
}
|
2043
|
+
}
|
2044
|
+
|
2045
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
2046
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2047
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2048
|
+
|
2049
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
2050
|
+
|
2051
|
+
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
2052
|
+
|
2053
|
+
#pragma unroll
|
2054
|
+
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
2055
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2056
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
|
2057
|
+
}
|
2058
|
+
|
2059
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
2060
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
2061
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
2062
|
+
}
|
2063
|
+
|
2064
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
2065
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2066
|
+
|
2067
|
+
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
|
2068
|
+
|
2069
|
+
int vl[VDR_Q5_0_Q8_1_MMVQ];
|
2070
|
+
int vh[VDR_Q5_0_Q8_1_MMVQ];
|
2071
|
+
int u[2*VDR_Q5_0_Q8_1_MMVQ];
|
2072
|
+
|
2073
|
+
#pragma unroll
|
2074
|
+
for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
|
2075
|
+
vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
|
2076
|
+
vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
|
2077
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
2078
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
|
2079
|
+
}
|
2080
|
+
|
2081
|
+
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
2082
|
+
}
|
2083
|
+
|
2084
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2085
|
+
|
2086
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2087
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
|
2088
|
+
|
2089
|
+
*x_ql = tile_x_ql;
|
2090
|
+
*x_dm = (half2 *) tile_x_d;
|
2091
|
+
}
|
2092
|
+
|
2093
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
2094
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2095
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2096
|
+
|
2097
|
+
__builtin_assume(i_offset >= 0);
|
2098
|
+
__builtin_assume(i_offset < nwarps);
|
2099
|
+
__builtin_assume(k >= 0);
|
2100
|
+
__builtin_assume(k < WARP_SIZE);
|
2101
|
+
|
2102
|
+
const int kbx = k / QI5_0;
|
2103
|
+
const int kqsx = k % QI5_0;
|
2104
|
+
|
2105
|
+
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
2106
|
+
|
2107
|
+
#pragma unroll
|
2108
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2109
|
+
int i = i0 + i_offset;
|
2110
|
+
|
2111
|
+
if (need_check) {
|
2112
|
+
i = min(i, i_max);
|
2113
|
+
}
|
2114
|
+
|
2115
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2116
|
+
|
2117
|
+
const int ql = get_int_from_uint8(bxi->qs, kqsx);
|
2118
|
+
const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
|
2119
|
+
|
2120
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
2121
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
2122
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
2123
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
2124
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
2125
|
+
qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
|
2126
|
+
|
2127
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
2128
|
+
|
2129
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
2130
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
2131
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
2132
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
2133
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
2134
|
+
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
2135
|
+
|
2136
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
2137
|
+
}
|
2138
|
+
|
2139
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
|
2140
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2141
|
+
float * x_dmf = (float *) x_dm;
|
2142
|
+
|
2143
|
+
#pragma unroll
|
2144
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
|
2145
|
+
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
2146
|
+
|
2147
|
+
if (need_check) {
|
2148
|
+
i = min(i, i_max);
|
2149
|
+
}
|
2150
|
+
|
2151
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2152
|
+
|
2153
|
+
x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
|
2154
|
+
}
|
2155
|
+
}
|
2156
|
+
|
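load_tiles_q5_0 scatters the four high bits taken from qh into bit positions 4, 12, 20 and 28 of the packed low nibbles (the "0 -> 4", "1 -> 12", ... comments), then strips the +16 offset from all four bytes at once with __vsubss4. A host-side reference model of that intrinsic, given purely for orientation (vsubss4_ref is not part of this file):

#include <cstdint>

// Reference-only model of __vsubss4(a, b): per-byte signed subtraction with
// saturation, which is how the +16 offset (0x10101010) is removed above.
static int32_t vsubss4_ref(int32_t a, int32_t b) {
    uint32_t r = 0;
    for (int i = 0; i < 4; ++i) {
        int s = (int8_t)(a >> (8 * i)) - (int8_t)(b >> (8 * i));
        if (s >  127) { s =  127; } // saturate high
        if (s < -128) { s = -128; } // saturate low
        r |= (uint32_t)(s & 0xff) << (8 * i);
    }
    return (int32_t)r;
}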
2157
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
2158
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2159
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2160
|
+
|
2161
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
2162
|
+
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
2163
|
+
const float * x_dmf = (const float *) x_dm;
|
2164
|
+
const float * y_df = (const float *) y_ds;
|
2165
|
+
|
2166
|
+
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
2167
|
+
|
2168
|
+
#pragma unroll
|
2169
|
+
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
2170
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2171
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
|
2172
|
+
}
|
2173
|
+
|
2174
|
+
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
2175
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
2176
|
+
}
|
2177
|
+
|
2178
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
2179
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2180
|
+
|
2181
|
+
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
|
2182
|
+
|
2183
|
+
int vl[VDR_Q5_1_Q8_1_MMVQ];
|
2184
|
+
int vh[VDR_Q5_1_Q8_1_MMVQ];
|
2185
|
+
int u[2*VDR_Q5_1_Q8_1_MMVQ];
|
2186
|
+
|
2187
|
+
#pragma unroll
|
2188
|
+
for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
|
2189
|
+
vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
|
2190
|
+
vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
|
2191
|
+
u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
2192
|
+
u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
|
2193
|
+
}
|
2194
|
+
|
2195
|
+
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
2196
|
+
}
|
2197
|
+
|
2198
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2199
|
+
|
2200
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2201
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
|
2202
|
+
|
2203
|
+
*x_ql = tile_x_ql;
|
2204
|
+
*x_dm = tile_x_dm;
|
2205
|
+
}
|
2206
|
+
|
2207
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
2208
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2209
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2210
|
+
|
2211
|
+
__builtin_assume(i_offset >= 0);
|
2212
|
+
__builtin_assume(i_offset < nwarps);
|
2213
|
+
__builtin_assume(k >= 0);
|
2214
|
+
__builtin_assume(k < WARP_SIZE);
|
2215
|
+
|
2216
|
+
const int kbx = k / QI5_1;
|
2217
|
+
const int kqsx = k % QI5_1;
|
2218
|
+
|
2219
|
+
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
2220
|
+
|
2221
|
+
#pragma unroll
|
2222
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2223
|
+
int i = i0 + i_offset;
|
2224
|
+
|
2225
|
+
if (need_check) {
|
2226
|
+
i = min(i, i_max);
|
2227
|
+
}
|
2228
|
+
|
2229
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
2230
|
+
|
2231
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2232
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
|
2233
|
+
|
2234
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
2235
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
2236
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
2237
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
2238
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
2239
|
+
|
2240
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
2241
|
+
|
2242
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
2243
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
2244
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
2245
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
2246
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
2247
|
+
|
2248
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
2249
|
+
}
|
2250
|
+
|
2251
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
|
2252
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2253
|
+
|
2254
|
+
#pragma unroll
|
2255
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
|
2256
|
+
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
2257
|
+
|
2258
|
+
if (need_check) {
|
2259
|
+
i = min(i, i_max);
|
2260
|
+
}
|
2261
|
+
|
2262
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2263
|
+
|
2264
|
+
x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
|
2265
|
+
}
|
2266
|
+
}
|
2267
|
+
|
2268
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
2269
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2270
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2271
|
+
|
2272
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
2273
|
+
const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k/QI5_1;
|
2274
|
+
|
2275
|
+
int u[2*VDR_Q5_1_Q8_1_MMQ];
|
2276
|
+
|
2277
|
+
#pragma unroll
|
2278
|
+
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
2279
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2280
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
|
2281
|
+
}
|
2282
|
+
|
2283
|
+
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
2284
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
2285
|
+
}
|
2286
|
+
|
2287
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
2288
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2289
|
+
|
2290
|
+
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
|
2291
|
+
|
2292
|
+
int v[VDR_Q8_0_Q8_1_MMVQ];
|
2293
|
+
int u[VDR_Q8_0_Q8_1_MMVQ];
|
2294
|
+
|
2295
|
+
#pragma unroll
|
2296
|
+
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
2297
|
+
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
2298
|
+
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
2299
|
+
}
|
2300
|
+
|
2301
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
2302
|
+
}
|
2303
|
+
|
2304
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2305
|
+
|
2306
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
2307
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
2308
|
+
|
2309
|
+
*x_ql = tile_x_qs;
|
2310
|
+
*x_dm = (half2 *) tile_x_d;
|
2311
|
+
}
|
2312
|
+
|
2313
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2314
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2315
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2316
|
+
|
2317
|
+
__builtin_assume(i_offset >= 0);
|
2318
|
+
__builtin_assume(i_offset < nwarps);
|
2319
|
+
__builtin_assume(k >= 0);
|
2320
|
+
__builtin_assume(k < WARP_SIZE);
|
2321
|
+
|
2322
|
+
const int kbx = k / QI8_0;
|
2323
|
+
const int kqsx = k % QI8_0;
|
2324
|
+
float * x_dmf = (float *) x_dm;
|
2325
|
+
|
2326
|
+
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2327
|
+
|
2328
|
+
#pragma unroll
|
2329
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2330
|
+
int i = i0 + i_offset;
|
2331
|
+
|
2332
|
+
if (need_check) {
|
2333
|
+
i = min(i, i_max);
|
2334
|
+
}
|
2335
|
+
|
2336
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2337
|
+
|
2338
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2339
|
+
}
|
2340
|
+
|
2341
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2342
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2343
|
+
|
2344
|
+
#pragma unroll
|
2345
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
|
2346
|
+
int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2347
|
+
|
2348
|
+
if (need_check) {
|
2349
|
+
i = min(i, i_max);
|
2350
|
+
}
|
2351
|
+
|
2352
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2353
|
+
|
2354
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
|
2355
|
+
}
|
2356
|
+
}
|
2357
|
+
|
2358
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2359
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2360
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2361
|
+
|
2362
|
+
const float * x_dmf = (const float *) x_dm;
|
2363
|
+
const float * y_df = (const float *) y_ds;
|
2364
|
+
|
2365
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2366
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2367
|
+
y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2368
|
+
}
|
2369
|
+
|
2370
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
2371
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2372
|
+
|
2373
|
+
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
|
2374
|
+
|
2375
|
+
const int bq8_offset = QR2_K * (iqs / QI8_1);
|
2376
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2377
|
+
|
2378
|
+
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2379
|
+
|
2380
|
+
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2381
|
+
int u[QR2_K];
|
2382
|
+
float d8[QR2_K];
|
2383
|
+
|
2384
|
+
#pragma unroll
|
2385
|
+
for (int i = 0; i < QR2_K; ++ i) {
|
2386
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2387
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2388
|
+
}
|
2389
|
+
|
2390
|
+
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
2391
|
+
}
|
2392
|
+
|
2393
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2394
|
+
|
2395
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2396
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
2397
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2398
|
+
|
2399
|
+
*x_ql = tile_x_ql;
|
2400
|
+
*x_dm = tile_x_dm;
|
2401
|
+
*x_sc = tile_x_sc;
|
2402
|
+
}
|
2403
|
+
|
2404
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2405
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2406
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2407
|
+
|
2408
|
+
__builtin_assume(i_offset >= 0);
|
2409
|
+
__builtin_assume(i_offset < nwarps);
|
2410
|
+
__builtin_assume(k >= 0);
|
2411
|
+
__builtin_assume(k < WARP_SIZE);
|
2412
|
+
|
2413
|
+
const int kbx = k / QI2_K;
|
2414
|
+
const int kqsx = k % QI2_K;
|
2415
|
+
|
2416
|
+
const block_q2_K * bx0 = (block_q2_K *) vx;
|
2417
|
+
|
2418
|
+
#pragma unroll
|
2419
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2420
|
+
int i = i0 + i_offset;
|
2421
|
+
|
2422
|
+
if (need_check) {
|
2423
|
+
i = min(i, i_max);
|
2424
|
+
}
|
2425
|
+
|
2426
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2427
|
+
|
2428
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2429
|
+
}
|
2430
|
+
|
2431
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
|
2432
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2433
|
+
|
2434
|
+
#pragma unroll
|
2435
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
|
2436
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
|
2437
|
+
|
2438
|
+
if (need_check) {
|
2439
|
+
i = min(i, i_max);
|
2440
|
+
}
|
2441
|
+
|
2442
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2443
|
+
|
2444
|
+
x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
|
2445
|
+
}
|
2446
|
+
|
2447
|
+
#pragma unroll
|
2448
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2449
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2450
|
+
|
2451
|
+
if (need_check) {
|
2452
|
+
i = min(i, i_max);
|
2453
|
+
}
|
2454
|
+
|
2455
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
|
2456
|
+
|
2457
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
|
2458
|
+
}
|
2459
|
+
}
|
2460
|
+
|
2461
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
2462
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2463
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2464
|
+
|
2465
|
+
const int kbx = k / QI2_K;
|
2466
|
+
const int ky = (k % QI2_K) * QR2_K;
|
2467
|
+
const float * y_df = (const float *) y_ds;
|
2468
|
+
|
2469
|
+
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
|
2470
|
+
|
2471
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
|
2472
|
+
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
|
2473
|
+
|
2474
|
+
#pragma unroll
|
2475
|
+
for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
|
2476
|
+
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2477
|
+
}
|
2478
|
+
|
2479
|
+
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
|
2480
|
+
|
2481
|
+
const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
|
2482
|
+
return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
|
2483
|
+
}
|
2484
|
+
|
2485
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
2486
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2487
|
+
|
2488
|
+
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
|
2489
|
+
|
2490
|
+
const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
|
2491
|
+
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
|
2492
|
+
|
2493
|
+
const float d = bq3_K->d;
|
2494
|
+
|
2495
|
+
const int vl = get_int_from_uint8(bq3_K->qs, iqs);
|
2496
|
+
|
2497
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2498
|
+
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2499
|
+
|
2500
|
+
int u[QR3_K];
|
2501
|
+
float d8[QR3_K];
|
2502
|
+
|
2503
|
+
#pragma unroll
|
2504
|
+
for (int i = 0; i < QR3_K; ++i) {
|
2505
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2506
|
+
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2507
|
+
}
|
2508
|
+
|
2509
|
+
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2510
|
+
}
|
2511
|
+
|
2512
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2513
|
+
|
2514
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2515
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
|
2516
|
+
__shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
|
2517
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2518
|
+
|
2519
|
+
*x_ql = tile_x_ql;
|
2520
|
+
*x_dm = tile_x_dm;
|
2521
|
+
*x_qh = tile_x_qh;
|
2522
|
+
*x_sc = tile_x_sc;
|
2523
|
+
}
|
2524
|
+
|
2525
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2526
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2527
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2528
|
+
|
2529
|
+
__builtin_assume(i_offset >= 0);
|
2530
|
+
__builtin_assume(i_offset < nwarps);
|
2531
|
+
__builtin_assume(k >= 0);
|
2532
|
+
__builtin_assume(k < WARP_SIZE);
|
2533
|
+
|
2534
|
+
const int kbx = k / QI3_K;
|
2535
|
+
const int kqsx = k % QI3_K;
|
2536
|
+
|
2537
|
+
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2538
|
+
|
2539
|
+
#pragma unroll
|
2540
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2541
|
+
int i = i0 + i_offset;
|
2542
|
+
|
2543
|
+
if (need_check) {
|
2544
|
+
i = min(i, i_max);
|
2545
|
+
}
|
2546
|
+
|
2547
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2548
|
+
|
2549
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
2550
|
+
}
|
2551
|
+
|
2552
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2553
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2554
|
+
float * x_dmf = (float *) x_dm;
|
2555
|
+
|
2556
|
+
#pragma unroll
|
2557
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
|
2558
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
|
2559
|
+
|
2560
|
+
if (need_check) {
|
2561
|
+
i = min(i, i_max);
|
2562
|
+
}
|
2563
|
+
|
2564
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2565
|
+
|
2566
|
+
x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
|
2567
|
+
}
|
2568
|
+
|
2569
|
+
#pragma unroll
|
2570
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
|
2571
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2572
|
+
|
2573
|
+
if (need_check) {
|
2574
|
+
i = min(i, i_max);
|
2575
|
+
}
|
2576
|
+
|
2577
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2578
|
+
|
2579
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2580
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2581
|
+
}
|
2582
|
+
|
2583
|
+
#pragma unroll
|
2584
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2585
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2586
|
+
|
2587
|
+
if (need_check) {
|
2588
|
+
i = min(i, i_max);
|
2589
|
+
}
|
2590
|
+
|
2591
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2592
|
+
|
2593
|
+
const int ksc = k % (QI3_K/4);
|
2594
|
+
|
2595
|
+
const int ksc_low = ksc % (QI3_K/8);
|
2596
|
+
const int shift_low = 4 * (ksc / (QI3_K/8));
|
2597
|
+
const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
|
2598
|
+
|
2599
|
+
const int ksc_high = QI3_K/8;
|
2600
|
+
const int shift_high = 2 * ksc;
|
2601
|
+
const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
|
2602
|
+
|
2603
|
+
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
2604
|
+
|
2605
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
|
2606
|
+
}
|
2607
|
+
}
|
2608
|
+
|
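load_tiles_q3_K rebuilds each 6-bit block scale from two separately stored pieces, a 4-bit low part (sc_low) and a 2-bit high part (sc_high), and then subtracts 32 from every byte with __vsubss4 to make the scales signed. Taken one scale at a time and ignoring the exact packing order inside block_q3_K::scales, the combine step amounts to the following sketch (q3_K_scale_ref is illustrative only):

#include <cstdint>

// Illustrative only: combine the 4-bit low part and 2-bit high part of one
// q3_K scale and re-center it, mirroring (sc_low | sc_high) minus 0x20 per byte.
static int8_t q3_K_scale_ref(uint8_t low4, uint8_t high2) {
    return (int8_t)((((high2 & 0x03) << 4) | (low4 & 0x0F)) - 32);
}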
2609
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
2610
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2611
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2612
|
+
|
2613
|
+
const int kbx = k / QI3_K;
|
2614
|
+
const int ky = (k % QI3_K) * QR3_K;
|
2615
|
+
const float * x_dmf = (const float *) x_dm;
|
2616
|
+
const float * y_df = (const float *) y_ds;
|
2617
|
+
|
2618
|
+
const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
|
2619
|
+
|
2620
|
+
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
|
2621
|
+
|
2622
|
+
#pragma unroll
|
2623
|
+
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
|
2624
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
|
2625
|
+
const int shift = 2 * ((ky % 32) / 8);
|
2626
|
+
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2627
|
+
|
2628
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
|
2629
|
+
const int vlh = (vh << 2) & 0x04040404;
|
2630
|
+
|
2631
|
+
v[l] = __vsubss4(vll, vlh);
|
2632
|
+
}
|
2633
|
+
|
2634
|
+
const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
|
2635
|
+
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
|
2636
|
+
}
|
2637
|
+
|
2638
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
2639
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2640
|
+
|
2641
|
+
#ifndef GGML_QKK_64
|
2642
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2643
|
+
|
2644
|
+
int v[2];
|
2645
|
+
int u[2*QR4_K];
|
2646
|
+
float d8[QR4_K];
|
2647
|
+
|
2648
|
+
// iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
|
2649
|
+
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
|
2650
|
+
|
2651
|
+
// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
|
2652
|
+
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
|
2653
|
+
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
2654
|
+
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
2655
|
+
|
2656
|
+
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2657
|
+
v[0] = q4[0];
|
2658
|
+
v[1] = q4[4];
|
2659
|
+
|
2660
|
+
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
|
2661
|
+
uint16_t aux[2];
|
2662
|
+
const int j = bq8_offset/2;
|
2663
|
+
if (j < 2) {
|
2664
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2665
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2666
|
+
} else {
|
2667
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2668
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2669
|
+
}
|
2670
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2671
|
+
const uint8_t * m = sc + 2;
|
2672
|
+
|
2673
|
+
for (int i = 0; i < QR4_K; ++i) {
|
2674
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2675
|
+
d8[i] = bq8i->ds.x;
|
2676
|
+
|
2677
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2678
|
+
u[2*i+0] = q8[0];
|
2679
|
+
u[2*i+1] = q8[4];
|
2680
|
+
}
|
2681
|
+
|
2682
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
2683
|
+
|
2684
|
+
#else
|
2685
|
+
|
2686
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2687
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
2688
|
+
|
2689
|
+
float sumf_d = 0.0f;
|
2690
|
+
float sumf_m = 0.0f;
|
2691
|
+
|
2692
|
+
uint16_t aux16[2];
|
2693
|
+
const uint8_t * s = (const uint8_t *)aux16;
|
2694
|
+
|
2695
|
+
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
2696
|
+
aux16[0] = a[0] & 0x0f0f;
|
2697
|
+
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
2698
|
+
|
2699
|
+
const float dall = bq4_K->d[0];
|
2700
|
+
const float dmin = bq4_K->d[1];
|
2701
|
+
|
2702
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2703
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2704
|
+
|
2705
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2706
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2707
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2708
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2709
|
+
|
2710
|
+
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
2711
|
+
const int v1 = q4[0];
|
2712
|
+
const int v2 = q4[4];
|
2713
|
+
|
2714
|
+
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
2715
|
+
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
2716
|
+
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
2717
|
+
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
2718
|
+
|
2719
|
+
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
2720
|
+
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
2721
|
+
|
2722
|
+
return dall * sumf_d - dmin * sumf_m;
|
2723
|
+
|
2724
|
+
#else
|
2725
|
+
return 0.0f; // only to satisfy the compiler
|
2726
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2727
|
+
|
2728
|
+
#endif
|
2729
|
+
}
|
2730
|
+
|
2731
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2732
|
+
|
2733
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2734
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
|
2735
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2736
|
+
|
2737
|
+
*x_ql = tile_x_ql;
|
2738
|
+
*x_dm = tile_x_dm;
|
2739
|
+
*x_sc = tile_x_sc;
|
2740
|
+
}
|
2741
|
+
|
2742
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2743
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2744
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2745
|
+
|
2746
|
+
__builtin_assume(i_offset >= 0);
|
2747
|
+
__builtin_assume(i_offset < nwarps);
|
2748
|
+
__builtin_assume(k >= 0);
|
2749
|
+
__builtin_assume(k < WARP_SIZE);
|
2750
|
+
|
2751
|
+
const int kbx = k / QI4_K; // == 0 if QK_K == 256
|
2752
|
+
const int kqsx = k % QI4_K; // == k if QK_K == 256
|
2753
|
+
|
2754
|
+
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2755
|
+
|
2756
|
+
#pragma unroll
|
2757
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2758
|
+
int i = i0 + i_offset;
|
2759
|
+
|
2760
|
+
if (need_check) {
|
2761
|
+
i = min(i, i_max);
|
2762
|
+
}
|
2763
|
+
|
2764
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2765
|
+
|
2766
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2767
|
+
}
|
2768
|
+
|
2769
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2770
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2771
|
+
|
2772
|
+
#pragma unroll
|
2773
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
|
2774
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
|
2775
|
+
|
2776
|
+
if (need_check) {
|
2777
|
+
i = min(i, i_max);
|
2778
|
+
}
|
2779
|
+
|
2780
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2781
|
+
|
2782
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2783
|
+
}
|
2784
|
+
|
2785
|
+
#pragma unroll
|
2786
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2787
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2788
|
+
|
2789
|
+
if (need_check) {
|
2790
|
+
i = min(i, i_max);
|
2791
|
+
}
|
2792
|
+
|
2793
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2794
|
+
|
2795
|
+
const int * scales = (int *) bxi->scales;
|
2796
|
+
|
2797
|
+
const int ksc = k % (WARP_SIZE/8);
|
2798
|
+
|
2799
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
|
2800
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2801
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2802
|
+
|
2803
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2804
|
+
}
|
2805
|
+
}
|
2806
|
+
|
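For q4_K (and q5_K below) the 12-byte scales array packs eight 6-bit scales and eight 6-bit mins; the aux[]/scales8 bit twiddling here and in vec_dot_q4_K_q8_1 unpacks them four at a time into the sc0..sc7, m0..m7 arrangement noted in the comment. A one-pair-at-a-time reference, in the spirit of the k-quants helper get_scale_min_k4 (the exact layout is stated here as an assumption, not taken from this diff):

#include <cstdint>

// Assumed q4_K/q5_K super-block scale layout: j = 0..7 selects one of eight
// (scale, min) pairs stored as 6-bit values in the 12-byte scales array.
static void get_scale_min_ref(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >>  4)  | ((q[j - 0] >> 6) << 4);
    }
}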
2807
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2808
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2809
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2810
|
+
|
2811
|
+
int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
|
2812
|
+
|
2813
|
+
#pragma unroll
|
2814
|
+
for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
|
2815
|
+
v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
|
2816
|
+
v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
|
2817
|
+
}
|
2818
|
+
|
2819
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
|
2820
|
+
|
2821
|
+
const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
|
2822
|
+
return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
|
2823
|
+
}
|
2824
|
+
|
2825
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
2826
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
2827
|
+
|
2828
|
+
#ifndef GGML_QKK_64
|
2829
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2830
|
+
|
2831
|
+
int vl[2];
|
2832
|
+
int vh[2];
|
2833
|
+
int u[2*QR5_K];
|
2834
|
+
float d8[QR5_K];
|
2835
|
+
|
2836
|
+
const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
|
2837
|
+
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
2838
|
+
const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
|
2839
|
+
|
2840
|
+
vl[0] = ql[0];
|
2841
|
+
vl[1] = ql[4];
|
2842
|
+
|
2843
|
+
vh[0] = qh[0] >> bq8_offset;
|
2844
|
+
vh[1] = qh[4] >> bq8_offset;
|
2845
|
+
|
2846
|
+
const uint16_t * scales = (const uint16_t *)bq5_K->scales;
|
2847
|
+
uint16_t aux[2];
|
2848
|
+
const int j = bq8_offset/2;
|
2849
|
+
if (j < 2) {
|
2850
|
+
aux[0] = scales[j+0] & 0x3f3f;
|
2851
|
+
aux[1] = scales[j+2] & 0x3f3f;
|
2852
|
+
} else {
|
2853
|
+
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
2854
|
+
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
2855
|
+
}
|
2856
|
+
const uint8_t * sc = (const uint8_t *)aux;
|
2857
|
+
const uint8_t * m = sc + 2;
|
2858
|
+
|
2859
|
+
#pragma unroll
|
2860
|
+
for (int i = 0; i < QR5_K; ++i) {
|
2861
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2862
|
+
d8[i] = bq8i->ds.x;
|
2863
|
+
|
2864
|
+
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2865
|
+
u[2*i+0] = q8[0];
|
2866
|
+
u[2*i+1] = q8[4];
|
2867
|
+
}
|
2868
|
+
|
2869
|
+
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
|
2870
|
+
|
2871
|
+
#else
|
2872
|
+
|
2873
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2874
|
+
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
2875
|
+
|
2876
|
+
const int8_t * s = bq5_K->scales;
|
2877
|
+
|
2878
|
+
const float d = bq5_K->d;
|
2879
|
+
|
2880
|
+
const float d8_1 = bq8_1[0].ds.x;
|
2881
|
+
const float d8_2 = bq8_1[1].ds.x;
|
2882
|
+
|
2883
|
+
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2884
|
+
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
2885
|
+
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
2886
|
+
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
2887
|
+
|
2888
|
+
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
2889
|
+
const int vl1 = ql[0];
|
2890
|
+
const int vl2 = ql[4];
|
2891
|
+
|
2892
|
+
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
2893
|
+
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
2894
|
+
const int in = step%8; // 0, 4, 0, 4
|
2895
|
+
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
2896
|
+
|
2897
|
+
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
2898
|
+
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
2899
|
+
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
2900
|
+
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
2901
|
+
|
2902
|
+
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
2903
|
+
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
2904
|
+
|
2905
|
+
return d * sumf_d;
|
2906
|
+
|
2907
|
+
#else
|
2908
|
+
return 0.0f; // only to satisfy the compiler
|
2909
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2910
|
+
|
2911
|
+
#endif
|
2912
|
+
}
|
2913
|
+
|
2914
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2915
|
+
|
2916
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2917
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
|
2918
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2919
|
+
|
2920
|
+
*x_ql = tile_x_ql;
|
2921
|
+
*x_dm = tile_x_dm;
|
2922
|
+
*x_sc = tile_x_sc;
|
2923
|
+
}
|
2924
|
+
|
2925
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2926
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2927
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2928
|
+
|
2929
|
+
__builtin_assume(i_offset >= 0);
|
2930
|
+
__builtin_assume(i_offset < nwarps);
|
2931
|
+
__builtin_assume(k >= 0);
|
2932
|
+
__builtin_assume(k < WARP_SIZE);
|
2933
|
+
|
2934
|
+
const int kbx = k / QI5_K; // == 0 if QK_K == 256
|
2935
|
+
const int kqsx = k % QI5_K; // == k if QK_K == 256
|
2936
|
+
|
2937
|
+
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2938
|
+
|
2939
|
+
#pragma unroll
|
2940
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2941
|
+
int i = i0 + i_offset;
|
2942
|
+
|
2943
|
+
if (need_check) {
|
2944
|
+
i = min(i, i_max);
|
2945
|
+
}
|
2946
|
+
|
2947
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2948
|
+
const int ky = QR5_K*kqsx;
|
2949
|
+
|
2950
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2951
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
2952
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2953
|
+
|
2954
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
|
2955
|
+
const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
|
2956
|
+
const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
|
2957
|
+
|
2958
|
+
const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
|
2959
|
+
const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
|
2960
|
+
|
2961
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
|
2962
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
|
2963
|
+
}
|
2964
|
+
|
2965
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2966
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2967
|
+
|
2968
|
+
#pragma unroll
|
2969
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
|
2970
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
|
2971
|
+
|
2972
|
+
if (need_check) {
|
2973
|
+
i = min(i, i_max);
|
2974
|
+
}
|
2975
|
+
|
2976
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2977
|
+
|
2978
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
2979
|
+
}
|
2980
|
+
|
2981
|
+
#pragma unroll
|
2982
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2983
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2984
|
+
|
2985
|
+
if (need_check) {
|
2986
|
+
i = min(i, i_max);
|
2987
|
+
}
|
2988
|
+
|
2989
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2990
|
+
|
2991
|
+
const int * scales = (int *) bxi->scales;
|
2992
|
+
|
2993
|
+
const int ksc = k % (WARP_SIZE/8);
|
2994
|
+
|
2995
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
|
2996
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2997
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2998
|
+
|
2999
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
3000
|
+
}
|
3001
|
+
}
|
3002
|
+
|
3003
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
3004
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3005
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3006
|
+
|
3007
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
|
3008
|
+
|
3009
|
+
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
|
3010
|
+
const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
|
3011
|
+
return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
|
3012
|
+
}
|
3013
|
+
|
3014
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
3015
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
3016
|
+
|
3017
|
+
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
3018
|
+
|
3019
|
+
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
|
3020
|
+
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
3021
|
+
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
3022
|
+
|
3023
|
+
const int vl = get_int_from_uint8(bq6_K->ql, iqs);
|
3024
|
+
const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
|
3025
|
+
|
3026
|
+
const int8_t * scales = bq6_K->scales + scale_offset;
|
3027
|
+
|
3028
|
+
int u[QR6_K];
|
3029
|
+
float d8[QR6_K];
|
3030
|
+
|
3031
|
+
#pragma unroll
|
3032
|
+
for (int i = 0; i < QR6_K; ++i) {
|
3033
|
+
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
3034
|
+
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
3035
|
+
}
|
3036
|
+
|
3037
|
+
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
3038
|
+
}
|
3039
|
+
|
3040
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
3041
|
+
|
3042
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
3043
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
|
3044
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
3045
|
+
|
3046
|
+
*x_ql = tile_x_ql;
|
3047
|
+
*x_dm = tile_x_dm;
|
3048
|
+
*x_sc = tile_x_sc;
|
3049
|
+
}
|
3050
|
+
|
3051
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
3052
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
3053
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
3054
|
+
|
3055
|
+
__builtin_assume(i_offset >= 0);
|
3056
|
+
__builtin_assume(i_offset < nwarps);
|
3057
|
+
__builtin_assume(k >= 0);
|
3058
|
+
__builtin_assume(k < WARP_SIZE);
|
3059
|
+
|
3060
|
+
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3061
|
+
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
3062
|
+
|
3063
|
+
const block_q6_K * bx0 = (block_q6_K *) vx;
|
3064
|
+
|
3065
|
+
#pragma unroll
|
3066
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
3067
|
+
int i = i0 + i_offset;
|
3068
|
+
|
3069
|
+
if (need_check) {
|
3070
|
+
i = min(i, i_max);
|
3071
|
+
}
|
3072
|
+
|
3073
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
3074
|
+
const int ky = QR6_K*kqsx;
|
3075
|
+
|
3076
|
+
const int ql = get_int_from_uint8(bxi->ql, kqsx);
|
3077
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
3078
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
3079
|
+
|
3080
|
+
const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
|
3081
|
+
const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
|
3082
|
+
const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
|
3083
|
+
|
3084
|
+
const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
|
3085
|
+
const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
|
3086
|
+
|
3087
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
3088
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
3089
|
+
}
|
3090
|
+
|
3091
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
3092
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
3093
|
+
float * x_dmf = (float *) x_dm;
|
3094
|
+
|
3095
|
+
#pragma unroll
|
3096
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
|
3097
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
|
3098
|
+
|
3099
|
+
if (need_check) {
|
3100
|
+
i = min(i, i_max);
|
3101
|
+
}
|
3102
|
+
|
3103
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3104
|
+
|
3105
|
+
x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
|
3106
|
+
}
|
3107
|
+
|
3108
|
+
#pragma unroll
|
3109
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3110
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
3111
|
+
|
3112
|
+
if (need_check) {
|
3113
|
+
i = min(i, i_max);
|
3114
|
+
}
|
3115
|
+
|
3116
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
|
3117
|
+
|
3118
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
|
3119
|
+
}
|
3120
|
+
}
|
3121
|
+
|
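load_tiles_q6_K merges the 4-bit ql part and the 2-bit qh part of every quant and subtracts 32 (0x20202020 per byte) while filling the tile, so x_ql already holds signed values and the mmq dot product needs no further offset handling. Per value, the reconstruction is simply (q6_K_value_ref is an illustrative name):

#include <cstdint>

// Illustrative scalar form of the q6_K value rebuilt above:
// low 4 bits from ql, high 2 bits from qh, minus the 32 offset.
static int8_t q6_K_value_ref(uint8_t ql4, uint8_t qh2) {
    return (int8_t)((((qh2 & 0x03) << 4) | (ql4 & 0x0F)) - 32);
}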
3122
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
3123
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3124
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3125
|
+
|
3126
|
+
const float * x_dmf = (const float *) x_dm;
|
3127
|
+
const float * y_df = (const float *) y_ds;
|
3128
|
+
|
3129
|
+
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
|
3130
|
+
|
3131
|
+
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
|
3132
|
+
const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
|
3133
|
+
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
|
3134
|
+
}
|
3135
|
+
|
3136
|
+
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
3137
|
+
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3138
|
+
static __global__ void mul_mat_q(
|
3139
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3140
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3141
|
+
|
3142
|
+
const block_q_t * x = (const block_q_t *) vx;
|
3143
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
3144
|
+
|
3145
|
+
const int blocks_per_row_x = ncols_x / qk;
|
3146
|
+
const int blocks_per_col_y = nrows_y / QK8_1;
|
3147
|
+
const int blocks_per_warp = WARP_SIZE / qi;
|
3148
|
+
|
3149
|
+
const int & ncols_dst = ncols_y;
|
3150
|
+
|
3151
|
+
const int row_dst_0 = blockIdx.x*mmq_y;
|
3152
|
+
const int & row_x_0 = row_dst_0;
|
3153
|
+
const int row_dst = row_dst_0 + threadIdx.x;
|
3154
|
+
|
3155
|
+
const int col_dst_0 = blockIdx.y*mmq_x;
|
3156
|
+
const int & col_y_0 = col_dst_0;
|
3157
|
+
|
3158
|
+
int * tile_x_ql = nullptr;
|
3159
|
+
half2 * tile_x_dm = nullptr;
|
3160
|
+
int * tile_x_qh = nullptr;
|
3161
|
+
int * tile_x_sc = nullptr;
|
3162
|
+
|
3163
|
+
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
3164
|
+
|
3165
|
+
__shared__ int tile_y_qs[mmq_x * WARP_SIZE];
|
3166
|
+
__shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
|
1767
3167
|
|
1768
|
-
|
1769
|
-
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
|
3168
|
+
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
|
1770
3169
|
|
1771
|
-
|
1772
|
-
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
|
1773
|
-
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
|
3170
|
+
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
1774
3171
|
|
1775
|
-
|
3172
|
+
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3173
|
+
threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
|
3174
|
+
|
3175
|
+
#pragma unroll
|
3176
|
+
for (int ir = 0; ir < qr; ++ir) {
|
3177
|
+
const int kqs = ir*WARP_SIZE + threadIdx.x;
|
3178
|
+
const int kbxd = kqs / QI8_1;
|
1776
3179
|
|
1777
|
-
|
3180
|
+
#pragma unroll
|
3181
|
+
for (int i = 0; i < mmq_x; i += nwarps) {
|
3182
|
+
const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
1778
3183
|
|
1779
|
-
|
1780
|
-
memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
|
3184
|
+
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
1781
3185
|
|
1782
|
-
|
1783
|
-
|
3186
|
+
const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
|
3187
|
+
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
|
3188
|
+
}
|
1784
3189
|
|
1785
|
-
|
1786
|
-
|
3190
|
+
#pragma unroll
|
3191
|
+
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
3192
|
+
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
|
3193
|
+
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
|
3194
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3195
|
+
|
3196
|
+
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
3197
|
+
const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
|
3198
|
+
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
|
3199
|
+
if (need_sum) {
|
3200
|
+
*dsi_dst = *dsi_src;
|
3201
|
+
} else {
|
3202
|
+
float * dfi_dst = (float *) dsi_dst;
|
3203
|
+
*dfi_dst = (*dsi_src).x;
|
3204
|
+
}
|
3205
|
+
}
|
1787
3206
|
|
1788
|
-
|
1789
|
-
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
|
1790
|
-
const float d8i = bq8i->d;
|
3207
|
+
__syncthreads();
|
1791
3208
|
|
1792
|
-
|
3209
|
+
// #pragma unroll // unrolling this loop causes too much register pressure
|
3210
|
+
for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
|
3211
|
+
#pragma unroll
|
3212
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3213
|
+
#pragma unroll
|
3214
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3215
|
+
sum[i/WARP_SIZE][j/nwarps] += vec_dot(
|
3216
|
+
tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3217
|
+
threadIdx.x + i, threadIdx.y + j, k);
|
3218
|
+
}
|
3219
|
+
}
|
3220
|
+
}
|
1793
3221
|
|
1794
|
-
|
3222
|
+
__syncthreads();
|
3223
|
+
}
|
3224
|
+
}
|
1795
3225
|
|
1796
|
-
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
1797
3226
|
|
1798
|
-
|
3227
|
+
if (row_dst >= nrows_dst) {
|
3228
|
+
return;
|
1799
3229
|
}
|
1800
3230
|
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
3231
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3232
|
+
const int col_dst = col_dst_0 + j + threadIdx.y;
|
3233
|
+
|
3234
|
+
if (col_dst >= ncols_dst) {
|
3235
|
+
return;
|
3236
|
+
}
|
3237
|
+
|
3238
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3239
|
+
dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
|
3240
|
+
}
|
3241
|
+
}
|
1805
3242
|
}
|
1806
3243
|
|
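The mul_mat_q kernel assembled above is a tiled integer GEMM over quantized blocks: each thread block owns an mmq_y x mmq_x tile of dst, walks the shared dimension in chunks of blocks_per_warp quant blocks, stages the x tile (quants plus scales) and the q8_1 y tile in shared memory, and accumulates through the per-format vec_dot before writing the tile back; need_sum decides whether the full q8_1 half2 (scale and row sum) or only the scale as a float is kept for the y tile. Judging from row_dst_0 = blockIdx.x*mmq_y and col_dst_0 = blockIdx.y*mmq_x, a plausible launch shape (the real launch code lives elsewhere in this file, so treat this as a sketch) is:

// const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
// const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
// const dim3 block_nums(block_num_x, block_num_y, 1);
// const dim3 block_dims(WARP_SIZE, nwarps, 1);
// mul_mat_q<...><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);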
1807
|
-
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3244
|
+
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
1808
3245
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
1809
3246
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
1810
3247
|
|
@@ -1813,7 +3250,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1813
3250
|
}
|
1814
3251
|
|
1815
3252
|
const int blocks_per_row = ncols / qk;
|
1816
|
-
const int blocks_per_warp = WARP_SIZE / qi;
|
3253
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
1817
3254
|
|
1818
3255
|
// partial sum for each thread
|
1819
3256
|
float tmp = 0.0f;
|
@@ -1822,11 +3259,11 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|
1822
3259
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
1823
3260
|
|
1824
3261
|
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
1825
|
-
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
|
3262
|
+
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
1826
3263
|
|
1827
|
-
const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
|
3264
|
+
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
|
1828
3265
|
|
1829
|
-
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
|
3266
|
+
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
1830
3267
|
|
1831
3268
|
tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
|
1832
3269
|
}
|
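The new vdr template parameter above ("values per dot product") changes the thread-to-block mapping in mul_mat_vec_q: one block of qi packed ints is now shared by qi/vdr threads that each consume vdr consecutive ints per call, so a warp advances over vdr*WARP_SIZE/qi blocks per loop iteration instead of WARP_SIZE/qi. As a purely illustrative calculation (the concrete QI*/VDR_* values are defined elsewhere in this file and are assumed here):

// e.g. if qi == 4 (q4_0) and vdr == 2:
//   threads per x block = qi / vdr             = 2
//   blocks per warp     = vdr * WARP_SIZE / qi = 2 * 32 / 4 = 16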
@@ -1859,11 +3296,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1859
3296
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
1860
3297
|
|
1861
3298
|
// partial sum for each thread
|
1862
|
-
#ifdef
|
3299
|
+
#ifdef GGML_CUDA_F16
|
1863
3300
|
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
1864
3301
|
#else
|
1865
3302
|
float tmp = 0.0f;
|
1866
|
-
#endif //
|
3303
|
+
#endif // GGML_CUDA_F16
|
1867
3304
|
|
1868
3305
|
for (int i = 0; i < ncols; i += iter_stride) {
|
1869
3306
|
const int col = i + vals_per_iter*tid;
|
@@ -1883,7 +3320,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1883
3320
|
|
1884
3321
|
// matrix multiplication
|
1885
3322
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
1886
|
-
#ifdef
|
3323
|
+
#ifdef GGML_CUDA_F16
|
1887
3324
|
tmp += __hmul2(v, {
|
1888
3325
|
y[iybs + iqs + j/qr + 0],
|
1889
3326
|
y[iybs + iqs + j/qr + y_offset]
|
@@ -1891,7 +3328,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1891
3328
|
#else
|
1892
3329
|
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
1893
3330
|
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
1894
|
-
#endif //
|
3331
|
+
#endif // GGML_CUDA_F16
|
1895
3332
|
}
|
1896
3333
|
}
|
1897
3334
|
|
@@ -1902,11 +3339,11 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
1902
3339
|
}
|
1903
3340
|
|
1904
3341
|
if (tid == 0) {
|
1905
|
-
#ifdef
|
3342
|
+
#ifdef GGML_CUDA_F16
|
1906
3343
|
dst[row] = tmp.x + tmp.y;
|
1907
3344
|
#else
|
1908
3345
|
dst[row] = tmp;
|
1909
|
-
#endif //
|
3346
|
+
#endif // GGML_CUDA_F16
|
1910
3347
|
}
|
1911
3348
|
}
|
1912
3349
|
|
@@ -2046,7 +3483,8 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
2046
3483
|
}
|
2047
3484
|
|
2048
3485
|
// rope == RoPE == rotary positional embedding
|
2049
|
-
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float
|
3486
|
+
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3487
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
2050
3488
|
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
2051
3489
|
|
2052
3490
|
if (col >= ncols) {
|
@@ -2056,7 +3494,7 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
2056
3494
|
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
2057
3495
|
const int i = row*ncols + col;
|
2058
3496
|
|
2059
|
-
const float theta =
|
3497
|
+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
2060
3498
|
const float sin_theta = sinf(theta);
|
2061
3499
|
const float cos_theta = cosf(theta);
|
2062
3500
|
|
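With the new signature, rope_f32 derives the rotation angle per element as theta = (p0 + p_delta * (row / p_delta_rows)) * theta_scale^(col/2), i.e. the position comes from p0 plus a per-row delta instead of a single precomputed value. Each pair of adjacent elements is then rotated by that angle; a scalar sketch of the rotation applied to one pair, assuming the unchanged kernel body follows the standard RoPE formulation:

#include <math.h>

// Rotate one (x0, x1) pair by theta: the core of rotary positional embedding.
static void rope_pair_ref(float x0, float x1, float theta, float * y0, float * y1) {
    const float c = cosf(theta);
    const float s = sinf(theta);
    *y0 = x0 * c - x1 * s;
    *y1 = x0 * s + x1 * c;
}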
@@ -2203,9 +3641,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
|
|
2203
3641
|
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
2204
3642
|
}
|
2205
3643
|
|
2206
|
-
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int
|
2207
|
-
const int
|
2208
|
-
|
3644
|
+
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
|
3645
|
+
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
3646
|
+
const dim3 num_blocks(block_num_x, ky, 1);
|
3647
|
+
const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
|
3648
|
+
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
|
2209
3649
|
}
|
2210
3650
|
|
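quantize_row_q8_1_cuda now launches a two-dimensional grid: blockIdx.y selects one of the ky input rows and blockIdx.x tiles the padded row width, so a whole activation matrix is quantized to q8_1 in a single kernel launch rather than one row at a time. The kx_padded argument lets the quantized buffer be padded out to the tile width the mul_mat_q path expects; elements past kx are presumably zero-filled by quantize_q8_1 so the padding cannot perturb the dot products. A sketch of the mapping, using the names from the call above:

// blockIdx.y                        -> row iy in [0, ky)
// blockIdx.x * blockDim.x + tid.x   -> padded column ix in [0, kx_padded)
// source element                    -> x[iy*kx + ix] when ix < kx, else 0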
2211
3651
|
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
@@ -2366,7 +3806,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2366
3806
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2367
3807
|
const dim3 block_nums(1, block_num_y, 1);
|
2368
3808
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2369
|
-
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
|
3809
|
+
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
2370
3810
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2371
3811
|
}
|
2372
3812
|
|
@@ -2375,7 +3815,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2375
3815
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2376
3816
|
const dim3 block_nums(1, block_num_y, 1);
|
2377
3817
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2378
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
|
3818
|
+
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
2379
3819
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2380
3820
|
}
|
2381
3821
|
|
@@ -2384,7 +3824,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2384
3824
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2385
3825
|
const dim3 block_nums(1, block_num_y, 1);
|
2386
3826
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2387
|
-
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
|
3827
|
+
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
2388
3828
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2389
3829
|
}
|
2390
3830
|
|
@@ -2393,7 +3833,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2393
3833
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2394
3834
|
const dim3 block_nums(1, block_num_y, 1);
|
2395
3835
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2396
|
-
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
|
3836
|
+
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
2397
3837
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2398
3838
|
}
|
2399
3839
|
|
@@ -2402,7 +3842,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2402
3842
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2403
3843
|
const dim3 block_nums(1, block_num_y, 1);
|
2404
3844
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2405
|
-
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
|
3845
|
+
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
2406
3846
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2407
3847
|
}
|
2408
3848
|
|
@@ -2411,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2411
3851
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2412
3852
|
const dim3 block_nums(1, block_num_y, 1);
|
2413
3853
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2414
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
|
3854
|
+
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
2415
3855
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2416
3856
|
}
|
2417
3857
|
|
@@ -2420,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2420
3860
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2421
3861
|
const dim3 block_nums(1, block_num_y, 1);
|
2422
3862
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2423
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
|
3863
|
+
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
2424
3864
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2425
3865
|
}
|
2426
3866
|
|
@@ -2429,10 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2429
3869
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2430
3870
|
const dim3 block_nums(1, block_num_y, 1);
|
2431
3871
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2432
|
-
|
2433
|
-
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
|
2434
|
-
// is better amortized.
|
2435
|
-
mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
|
3872
|
+
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
2436
3873
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2437
3874
|
}
|
2438
3875
|
|
@@ -2441,10 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2441
3878
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2442
3879
|
const dim3 block_nums(1, block_num_y, 1);
|
2443
3880
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2444
|
-
|
2445
|
-
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
|
2446
|
-
// is better amortized.
|
2447
|
-
mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
|
3881
|
+
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
2448
3882
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2449
3883
|
}
|
2450
3884
|
|
@@ -2453,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
2453
3887
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
2454
3888
|
const dim3 block_nums(1, block_num_y, 1);
|
2455
3889
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
2456
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
|
3890
|
+
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
2457
3891
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
2458
3892
|
}
|
2459
3893
|
|
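Each mul_mat_vec_q instantiation above gains a VDR_*_MMVQ template argument. These constants presumably control how many quantized integer pairs each thread feeds into the dot-product routine per iteration, so the inner loop can be unrolled at compile time; the kernel and constant definitions themselves lie outside this excerpt. A generic illustration of that pattern (not the library's kernel; requires a device with compute capability 6.1 or newer for __dp4a):

```cpp
// Generic sketch: a compile-time vdr constant lets the per-thread dot-product
// loop unroll fully, accumulating byte-wise products with __dp4a.
template <int vdr>
static __device__ __forceinline__ int unrolled_dp4a(const int * x, const int * y) {
    int sum = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        sum = __dp4a(x[i], y[i], sum);
    }
    return sum;
}
```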
@@ -2500,6 +3934,537 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
2500
3934
|
}
|
2501
3935
|
}
|
2502
3936
|
|
3937
|
+
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
3938
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3939
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3940
|
+
|
3941
|
+
int id;
|
3942
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
3943
|
+
const int compute_capability = g_compute_capabilities[id];
|
3944
|
+
|
3945
|
+
if (compute_capability >= CC_TURING) {
|
3946
|
+
const int mmq_x = 64;
|
3947
|
+
const int mmq_y = 128;
|
3948
|
+
const int nwarps = 4;
|
3949
|
+
|
3950
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
3951
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
3952
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3953
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
3954
|
+
|
3955
|
+
if (nrows_x % mmq_y == 0) {
|
3956
|
+
const bool need_check = false;
|
3957
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3958
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3959
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3960
|
+
} else {
|
3961
|
+
const bool need_check = true;
|
3962
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3963
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3964
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3965
|
+
}
|
3966
|
+
} else {
|
3967
|
+
const int mmq_x = 64;
|
3968
|
+
const int mmq_y = 64;
|
3969
|
+
const int nwarps = 4;
|
3970
|
+
|
3971
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
3972
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
3973
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3974
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
3975
|
+
|
3976
|
+
if (nrows_x % mmq_y == 0) {
|
3977
|
+
const bool need_check = false;
|
3978
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3979
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3980
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3981
|
+
} else {
|
3982
|
+
const bool need_check = true;
|
3983
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3984
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3985
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3986
|
+
}
|
3987
|
+
}
|
3988
|
+
}
|
3989
|
+
|
3990
|
+
static void ggml_mul_mat_q4_1_q8_1_cuda(
|
3991
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3992
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3993
|
+
|
3994
|
+
int id;
|
3995
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
3996
|
+
const int compute_capability = g_compute_capabilities[id];
|
3997
|
+
|
3998
|
+
if (compute_capability >= CC_TURING) {
|
3999
|
+
const int mmq_x = 64;
|
4000
|
+
const int mmq_y = 128;
|
4001
|
+
const int nwarps = 4;
|
4002
|
+
|
4003
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4004
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4005
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4006
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4007
|
+
|
4008
|
+
if (nrows_x % mmq_y == 0) {
|
4009
|
+
const bool need_check = false;
|
4010
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4011
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4012
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4013
|
+
} else {
|
4014
|
+
const bool need_check = true;
|
4015
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4016
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4017
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4018
|
+
}
|
4019
|
+
} else {
|
4020
|
+
const int mmq_x = 64;
|
4021
|
+
const int mmq_y = 64;
|
4022
|
+
const int nwarps = 8;
|
4023
|
+
|
4024
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4025
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4026
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4027
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4028
|
+
|
4029
|
+
if (nrows_x % mmq_y == 0) {
|
4030
|
+
const bool need_check = false;
|
4031
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4032
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4033
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4034
|
+
} else {
|
4035
|
+
const bool need_check = true;
|
4036
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4037
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4038
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4039
|
+
}
|
4040
|
+
|
4041
|
+
}
|
4042
|
+
}
|
4043
|
+
|
4044
|
+
static void ggml_mul_mat_q5_0_q8_1_cuda(
|
4045
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4046
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4047
|
+
|
4048
|
+
int id;
|
4049
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4050
|
+
const int compute_capability = g_compute_capabilities[id];
|
4051
|
+
|
4052
|
+
if (compute_capability >= CC_TURING) {
|
4053
|
+
const int mmq_x = 128;
|
4054
|
+
const int mmq_y = 64;
|
4055
|
+
const int nwarps = 4;
|
4056
|
+
|
4057
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4058
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4059
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4060
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4061
|
+
|
4062
|
+
if (nrows_x % mmq_y == 0) {
|
4063
|
+
const bool need_check = false;
|
4064
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4065
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4066
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4067
|
+
} else {
|
4068
|
+
const bool need_check = true;
|
4069
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4070
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4071
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4072
|
+
}
|
4073
|
+
} else {
|
4074
|
+
const int mmq_x = 64;
|
4075
|
+
const int mmq_y = 64;
|
4076
|
+
const int nwarps = 8;
|
4077
|
+
|
4078
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4079
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4080
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4081
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4082
|
+
|
4083
|
+
if (nrows_x % mmq_y == 0) {
|
4084
|
+
const bool need_check = false;
|
4085
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4086
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4087
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4088
|
+
} else {
|
4089
|
+
const bool need_check = true;
|
4090
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4091
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4092
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4093
|
+
}
|
4094
|
+
}
|
4095
|
+
}
|
4096
|
+
|
4097
|
+
static void ggml_mul_mat_q5_1_q8_1_cuda(
|
4098
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4099
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4100
|
+
|
4101
|
+
int id;
|
4102
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4103
|
+
const int compute_capability = g_compute_capabilities[id];
|
4104
|
+
|
4105
|
+
if (compute_capability >= CC_TURING) {
|
4106
|
+
const int mmq_x = 128;
|
4107
|
+
const int mmq_y = 64;
|
4108
|
+
const int nwarps = 8;
|
4109
|
+
|
4110
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4111
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4112
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4113
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4114
|
+
|
4115
|
+
if (nrows_x % mmq_y == 0) {
|
4116
|
+
const bool need_check = false;
|
4117
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4118
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4119
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4120
|
+
} else {
|
4121
|
+
const bool need_check = true;
|
4122
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4123
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4124
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4125
|
+
}
|
4126
|
+
} else {
|
4127
|
+
const int mmq_x = 64;
|
4128
|
+
const int mmq_y = 64;
|
4129
|
+
const int nwarps = 8;
|
4130
|
+
|
4131
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4132
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4133
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4134
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4135
|
+
|
4136
|
+
if (nrows_x % mmq_y == 0) {
|
4137
|
+
const bool need_check = false;
|
4138
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4139
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4140
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4141
|
+
} else {
|
4142
|
+
const bool need_check = true;
|
4143
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4144
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4145
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4146
|
+
}
|
4147
|
+
}
|
4148
|
+
}
|
4149
|
+
|
4150
|
+
static void ggml_mul_mat_q8_0_q8_1_cuda(
|
4151
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4152
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4153
|
+
|
4154
|
+
int id;
|
4155
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4156
|
+
const int compute_capability = g_compute_capabilities[id];
|
4157
|
+
|
4158
|
+
if (compute_capability >= CC_TURING) {
|
4159
|
+
const int mmq_x = 128;
|
4160
|
+
const int mmq_y = 64;
|
4161
|
+
const int nwarps = 4;
|
4162
|
+
|
4163
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4164
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4165
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4166
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4167
|
+
|
4168
|
+
if (nrows_x % mmq_y == 0) {
|
4169
|
+
const bool need_check = false;
|
4170
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4171
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4172
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4173
|
+
} else {
|
4174
|
+
const bool need_check = true;
|
4175
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4176
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4177
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4178
|
+
}
|
4179
|
+
} else {
|
4180
|
+
const int mmq_x = 64;
|
4181
|
+
const int mmq_y = 64;
|
4182
|
+
const int nwarps = 8;
|
4183
|
+
|
4184
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4185
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4186
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4187
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4188
|
+
|
4189
|
+
if (nrows_x % mmq_y == 0) {
|
4190
|
+
const bool need_check = false;
|
4191
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4192
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4193
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4194
|
+
} else {
|
4195
|
+
const bool need_check = true;
|
4196
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4197
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4198
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4199
|
+
}
|
4200
|
+
}
|
4201
|
+
}
|
4202
|
+
|
4203
|
+
static void ggml_mul_mat_q2_K_q8_1_cuda(
|
4204
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4205
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4206
|
+
|
4207
|
+
int id;
|
4208
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4209
|
+
const int compute_capability = g_compute_capabilities[id];
|
4210
|
+
|
4211
|
+
if (compute_capability >= CC_TURING) {
|
4212
|
+
const int mmq_x = 64;
|
4213
|
+
const int mmq_y = 128;
|
4214
|
+
const int nwarps = 4;
|
4215
|
+
|
4216
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4217
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4218
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4219
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4220
|
+
|
4221
|
+
if (nrows_x % mmq_y == 0) {
|
4222
|
+
const bool need_check = false;
|
4223
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4224
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4225
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4226
|
+
} else {
|
4227
|
+
const bool need_check = true;
|
4228
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4229
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4230
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4231
|
+
}
|
4232
|
+
} else {
|
4233
|
+
const int mmq_x = 64;
|
4234
|
+
const int mmq_y = 64;
|
4235
|
+
const int nwarps = 8;
|
4236
|
+
|
4237
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4238
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4239
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4240
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4241
|
+
|
4242
|
+
if (nrows_x % mmq_y == 0) {
|
4243
|
+
const bool need_check = false;
|
4244
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4245
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4246
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4247
|
+
} else {
|
4248
|
+
const bool need_check = true;
|
4249
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4250
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4251
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4252
|
+
}
|
4253
|
+
}
|
4254
|
+
}
|
4255
|
+
|
4256
|
+
static void ggml_mul_mat_q3_K_q8_1_cuda(
|
4257
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4258
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4259
|
+
|
4260
|
+
int id;
|
4261
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4262
|
+
const int compute_capability = g_compute_capabilities[id];
|
4263
|
+
|
4264
|
+
if (compute_capability >= CC_TURING) {
|
4265
|
+
const int mmq_x = 128;
|
4266
|
+
const int mmq_y = 128;
|
4267
|
+
const int nwarps = 4;
|
4268
|
+
|
4269
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4270
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4271
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4272
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4273
|
+
|
4274
|
+
if (nrows_x % mmq_y == 0) {
|
4275
|
+
const bool need_check = false;
|
4276
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
4277
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
4278
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4279
|
+
} else {
|
4280
|
+
const bool need_check = true;
|
4281
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
4282
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
4283
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4284
|
+
}
|
4285
|
+
} else {
|
4286
|
+
const int mmq_x = 64;
|
4287
|
+
const int mmq_y = 64;
|
4288
|
+
const int nwarps = 8;
|
4289
|
+
|
4290
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4291
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4292
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4293
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4294
|
+
|
4295
|
+
if (nrows_x % mmq_y == 0) {
|
4296
|
+
const bool need_check = false;
|
4297
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
4298
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
4299
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4300
|
+
} else {
|
4301
|
+
const bool need_check = true;
|
4302
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
4303
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
4304
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4305
|
+
}
|
4306
|
+
}
|
4307
|
+
}
|
4308
|
+
|
4309
|
+
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
4310
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4311
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4312
|
+
|
4313
|
+
int id;
|
4314
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4315
|
+
const int compute_capability = g_compute_capabilities[id];
|
4316
|
+
|
4317
|
+
if (compute_capability >= CC_TURING) {
|
4318
|
+
const int mmq_x = 64;
|
4319
|
+
const int mmq_y = 128;
|
4320
|
+
const int nwarps = 4;
|
4321
|
+
|
4322
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4323
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4324
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4325
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4326
|
+
|
4327
|
+
if (nrows_x % mmq_y == 0) {
|
4328
|
+
const bool need_check = false;
|
4329
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
4330
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
4331
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4332
|
+
} else {
|
4333
|
+
const bool need_check = true;
|
4334
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
4335
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
4336
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4337
|
+
}
|
4338
|
+
} else {
|
4339
|
+
const int mmq_x = 32;
|
4340
|
+
const int mmq_y = 64;
|
4341
|
+
const int nwarps = 8;
|
4342
|
+
|
4343
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4344
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4345
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4346
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4347
|
+
|
4348
|
+
if (nrows_x % mmq_y == 0) {
|
4349
|
+
const bool need_check = false;
|
4350
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
4351
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
4352
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4353
|
+
} else {
|
4354
|
+
const bool need_check = true;
|
4355
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
4356
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
4357
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4358
|
+
}
|
4359
|
+
}
|
4360
|
+
}
|
4361
|
+
|
4362
|
+
static void ggml_mul_mat_q5_K_q8_1_cuda(
|
4363
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4364
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4365
|
+
|
4366
|
+
int id;
|
4367
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4368
|
+
const int compute_capability = g_compute_capabilities[id];
|
4369
|
+
|
4370
|
+
if (compute_capability >= CC_TURING) {
|
4371
|
+
const int mmq_x = 64;
|
4372
|
+
const int mmq_y = 128;
|
4373
|
+
const int nwarps = 4;
|
4374
|
+
|
4375
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4376
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4377
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4378
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4379
|
+
|
4380
|
+
if (nrows_x % mmq_y == 0) {
|
4381
|
+
const bool need_check = false;
|
4382
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
4383
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
4384
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4385
|
+
} else {
|
4386
|
+
const bool need_check = true;
|
4387
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
4388
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
4389
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4390
|
+
}
|
4391
|
+
} else {
|
4392
|
+
const int mmq_x = 64;
|
4393
|
+
const int mmq_y = 64;
|
4394
|
+
const int nwarps = 8;
|
4395
|
+
|
4396
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4397
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4398
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4399
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4400
|
+
|
4401
|
+
if (nrows_x % mmq_y == 0) {
|
4402
|
+
const bool need_check = false;
|
4403
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
4404
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
4405
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4406
|
+
} else {
|
4407
|
+
const bool need_check = true;
|
4408
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
4409
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
4410
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4411
|
+
}
|
4412
|
+
}
|
4413
|
+
}
|
4414
|
+
|
4415
|
+
static void ggml_mul_mat_q6_K_q8_1_cuda(
|
4416
|
+
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4417
|
+
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4418
|
+
|
4419
|
+
int id;
|
4420
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4421
|
+
const int compute_capability = g_compute_capabilities[id];
|
4422
|
+
|
4423
|
+
if (compute_capability >= CC_TURING) {
|
4424
|
+
const int mmq_x = 64;
|
4425
|
+
const int mmq_y = 64;
|
4426
|
+
const int nwarps = 4;
|
4427
|
+
|
4428
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4429
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4430
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4431
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4432
|
+
|
4433
|
+
if (nrows_x % mmq_y == 0) {
|
4434
|
+
const bool need_check = false;
|
4435
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
4436
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
4437
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4438
|
+
} else {
|
4439
|
+
const bool need_check = true;
|
4440
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
4441
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
4442
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4443
|
+
}
|
4444
|
+
} else {
|
4445
|
+
const int mmq_x = 32;
|
4446
|
+
const int mmq_y = 64;
|
4447
|
+
const int nwarps = 8;
|
4448
|
+
|
4449
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4450
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4451
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4452
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4453
|
+
|
4454
|
+
if (nrows_x % mmq_y == 0) {
|
4455
|
+
const bool need_check = false;
|
4456
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
4457
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
4458
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4459
|
+
} else {
|
4460
|
+
const bool need_check = true;
|
4461
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
4462
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
4463
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4464
|
+
}
|
4465
|
+
}
|
4466
|
+
}
|
4467
|
+
|
2503
4468
|
static void ggml_mul_mat_p021_f16_f32_cuda(
|
2504
4469
|
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
2505
4470
|
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
|
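The block of new launchers above adds one ggml_mul_mat_q*_q8_1_cuda function per quantization type, all built on the same pattern: pick a tile shape (mmq_x x mmq_y) and warp count per architecture, with larger tiles on devices at or above CC_TURING, then instantiate mul_mat_q with need_check only when nrows_x is not a multiple of the tile height. A condensed restatement of that dispatch, using the q4_0 numbers from above (struct and helper names are illustrative):

```cpp
// Condensed form of the per-type launchers above (q4_0 tile sizes shown).
struct mmq_config { int mmq_x, mmq_y, nwarps; };

static mmq_config pick_mmq_config_q4_0(int compute_capability) {
    return compute_capability >= CC_TURING ? mmq_config{64, 128, 4}   // Turing and newer
                                           : mmq_config{64,  64, 4};  // older GPUs
}

static dim3 mmq_grid(int nrows_x, int ncols_y, const mmq_config & c) {
    const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y;  // tiles along rows of x
    const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x;  // tiles along cols of y
    return dim3(block_num_x, block_num_y, 1);
}
// need_check == (nrows_x % mmq_y != 0): only the last row tile can run out of
// bounds, so the guarded kernel instantiation is selected only in that case.
```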
@@ -2544,12 +4509,13 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
|
2544
4509
|
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
2545
4510
|
}
|
2546
4511
|
|
2547
|
-
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
|
4512
|
+
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
4513
|
+
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
2548
4514
|
GGML_ASSERT(nrows % 2 == 0);
|
2549
4515
|
const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
2550
4516
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
2551
4517
|
const dim3 block_nums(num_blocks_x, nrows, 1);
|
2552
|
-
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
|
4518
|
+
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
2553
4519
|
}
|
2554
4520
|
|
2555
4521
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
@@ -2670,21 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
|
2670
4636
|
}
|
2671
4637
|
|
2672
4638
|
|
2673
|
-
static void * g_scratch_buffer = nullptr;
|
2674
|
-
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
2675
|
-
static size_t g_scratch_offset = 0;
|
2676
|
-
|
2677
|
-
static int g_device_count = -1;
|
2678
|
-
static int g_main_device = 0;
|
2679
|
-
#ifndef GGML_CUDA_FORCE_DMMV
|
2680
|
-
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
2681
|
-
#endif
|
2682
|
-
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
2683
|
-
|
2684
|
-
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
2685
|
-
|
2686
|
-
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
2687
|
-
|
2688
4639
|
void ggml_init_cublas() {
|
2689
4640
|
static bool initialized = false;
|
2690
4641
|
|
@@ -2701,9 +4652,7 @@ void ggml_init_cublas() {
|
|
2701
4652
|
g_tensor_split[id] = total_vram;
|
2702
4653
|
total_vram += prop.totalGlobalMem;
|
2703
4654
|
|
2704
|
-
#ifndef GGML_CUDA_FORCE_DMMV
|
2705
4655
|
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
2706
|
-
#endif
|
2707
4656
|
}
|
2708
4657
|
for (int id = 0; id < g_device_count; ++id) {
|
2709
4658
|
g_tensor_split[id] /= total_vram;
|
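In ggml_init_cublas the per-device compute capability is now recorded unconditionally (the GGML_CUDA_FORCE_DMMV guards are gone), since the new mul_mat_q launchers and get_row_rounding consult it regardless of how dequantize-mul-mat-vec is built. The packed encoding is the one the CC_TURING comparisons rely on:

```cpp
// Restates the packing stored in g_compute_capabilities above:
// e.g. major=7, minor=5 gives 750, which passes the `>= CC_TURING` (700) checks.
static int pack_compute_capability(int major, int minor) {
    return 100 * major + 10 * minor;
}
```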
@@ -2965,6 +4914,114 @@ inline void ggml_cuda_op_rms_norm(
|
|
2965
4914
|
(void) i1;
|
2966
4915
|
}
|
2967
4916
|
|
4917
|
+
inline void ggml_cuda_op_mul_mat_q(
|
4918
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
4919
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
4920
|
+
cudaStream_t & cudaStream_main){
|
4921
|
+
|
4922
|
+
GGML_ASSERT(src0_ddq_i != nullptr);
|
4923
|
+
GGML_ASSERT(src1_ddf_i != nullptr);
|
4924
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
4925
|
+
|
4926
|
+
const int64_t ne00 = src0->ne[0];
|
4927
|
+
|
4928
|
+
const int64_t ne10 = src1->ne[0];
|
4929
|
+
const int64_t ne11 = src1->ne[1];
|
4930
|
+
GGML_ASSERT(ne10 % QK8_1 == 0);
|
4931
|
+
|
4932
|
+
const int64_t ne0 = dst->ne[0];
|
4933
|
+
|
4934
|
+
const int64_t i01_diff = i01_high - i01_low;
|
4935
|
+
|
4936
|
+
int id;
|
4937
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4938
|
+
|
4939
|
+
// the main device has a larger memory buffer to hold the results from all GPUs
|
4940
|
+
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
4941
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
|
4942
|
+
|
4943
|
+
const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
4944
|
+
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
4945
|
+
size_t as;
|
4946
|
+
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
|
4947
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
|
4948
|
+
|
4949
|
+
switch (src0->type) {
|
4950
|
+
case GGML_TYPE_Q4_0:
|
4951
|
+
ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4952
|
+
break;
|
4953
|
+
case GGML_TYPE_Q4_1:
|
4954
|
+
ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4955
|
+
break;
|
4956
|
+
case GGML_TYPE_Q5_0:
|
4957
|
+
ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4958
|
+
break;
|
4959
|
+
case GGML_TYPE_Q5_1:
|
4960
|
+
ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4961
|
+
break;
|
4962
|
+
case GGML_TYPE_Q8_0:
|
4963
|
+
ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4964
|
+
break;
|
4965
|
+
case GGML_TYPE_Q2_K:
|
4966
|
+
ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4967
|
+
break;
|
4968
|
+
case GGML_TYPE_Q3_K:
|
4969
|
+
ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4970
|
+
break;
|
4971
|
+
case GGML_TYPE_Q4_K:
|
4972
|
+
ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4973
|
+
break;
|
4974
|
+
case GGML_TYPE_Q5_K:
|
4975
|
+
ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4976
|
+
break;
|
4977
|
+
case GGML_TYPE_Q6_K:
|
4978
|
+
ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
|
4979
|
+
break;
|
4980
|
+
default:
|
4981
|
+
GGML_ASSERT(false);
|
4982
|
+
break;
|
4983
|
+
}
|
4984
|
+
|
4985
|
+
ggml_cuda_pool_free(src1_q8_1, as);
|
4986
|
+
|
4987
|
+
(void) src1;
|
4988
|
+
(void) dst;
|
4989
|
+
(void) src0_ddf_i;
|
4990
|
+
(void) i02;
|
4991
|
+
(void) i1;
|
4992
|
+
}
|
4993
|
+
|
4994
|
+
static int64_t get_row_rounding(ggml_type type) {
|
4995
|
+
int max_compute_capability = INT_MIN;
|
4996
|
+
for (int id = 0; id < g_device_count; ++id) {
|
4997
|
+
if (max_compute_capability < g_compute_capabilities[id]
|
4998
|
+
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
4999
|
+
max_compute_capability = g_compute_capabilities[id];
|
5000
|
+
}
|
5001
|
+
}
|
5002
|
+
|
5003
|
+
switch(type) {
|
5004
|
+
case GGML_TYPE_Q4_0:
|
5005
|
+
case GGML_TYPE_Q4_1:
|
5006
|
+
return max_compute_capability >= CC_TURING ? 128 : 64;
|
5007
|
+
case GGML_TYPE_Q5_0:
|
5008
|
+
case GGML_TYPE_Q5_1:
|
5009
|
+
case GGML_TYPE_Q8_0:
|
5010
|
+
return 64;
|
5011
|
+
case GGML_TYPE_F16:
|
5012
|
+
return 1;
|
5013
|
+
case GGML_TYPE_Q2_K:
|
5014
|
+
case GGML_TYPE_Q3_K:
|
5015
|
+
case GGML_TYPE_Q4_K:
|
5016
|
+
case GGML_TYPE_Q5_K:
|
5017
|
+
return max_compute_capability >= CC_TURING ? 128 : 64;
|
5018
|
+
case GGML_TYPE_Q6_K:
|
5019
|
+
return 64;
|
5020
|
+
default:
|
5021
|
+
GGML_ASSERT(false);
|
5022
|
+
}
|
5023
|
+
}
|
5024
|
+
|
2968
5025
|
inline void ggml_cuda_op_mul_mat_vec(
|
2969
5026
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
2970
5027
|
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
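ggml_cuda_op_mul_mat_q above quantizes the src1 slice to q8_1 on the fly: each row is padded up to a multiple of MATRIX_ROW_PADDING, a pool buffer sized for the padded matrix is allocated, and the result is passed to the per-type launcher selected by src0->type. A restatement of the scratch-size arithmetic used there (parameter names are illustrative):

```cpp
#include <cstddef>
#include <cstdint>

// Mirrors: padded_row_size * ne11 * sizeof(block_q8_1) / QK8_1 in the code above.
static size_t q8_1_scratch_bytes(int64_t ne10, int64_t ne11, int64_t row_padding,
                                 size_t block_q8_1_bytes, int64_t qk8_1) {
    const int64_t padded = ne10 % row_padding == 0
                               ? ne10
                               : ne10 - ne10 % row_padding + row_padding;
    return (size_t)(padded * ne11 * (int64_t)block_q8_1_bytes / qk8_1);
}
```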
@@ -2979,6 +5036,7 @@ inline void ggml_cuda_op_mul_mat_vec(
|
|
2979
5036
|
|
2980
5037
|
#ifdef GGML_CUDA_FORCE_DMMV
|
2981
5038
|
const bool use_mul_mat_vec_q = false;
|
5039
|
+
(void) g_compute_capabilities[0];
|
2982
5040
|
#else
|
2983
5041
|
int id;
|
2984
5042
|
CUDA_CHECK(cudaGetDevice(&id));
|
@@ -3006,7 +5064,7 @@ inline void ggml_cuda_op_mul_mat_vec(
|
|
3006
5064
|
ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
3007
5065
|
size_t as;
|
3008
5066
|
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
3009
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
|
5067
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
|
3010
5068
|
|
3011
5069
|
switch (src0->type) {
|
3012
5070
|
case GGML_TYPE_Q4_0:
|
@@ -3047,7 +5105,7 @@ inline void ggml_cuda_op_mul_mat_vec(
|
|
3047
5105
|
ggml_cuda_pool_free(src1_q8_1, as);
|
3048
5106
|
} else {
|
3049
5107
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
3050
|
-
#ifdef
|
5108
|
+
#ifdef GGML_CUDA_F16
|
3051
5109
|
size_t ash;
|
3052
5110
|
dfloat * src1_dfloat = nullptr; // dfloat == half
|
3053
5111
|
|
@@ -3063,7 +5121,7 @@ inline void ggml_cuda_op_mul_mat_vec(
|
|
3063
5121
|
}
|
3064
5122
|
#else
|
3065
5123
|
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
3066
|
-
#endif //
|
5124
|
+
#endif // GGML_CUDA_F16
|
3067
5125
|
|
3068
5126
|
switch (src0->type) {
|
3069
5127
|
case GGML_TYPE_Q4_0:
|
@@ -3104,11 +5162,11 @@ inline void ggml_cuda_op_mul_mat_vec(
|
|
3104
5162
|
break;
|
3105
5163
|
}
|
3106
5164
|
|
3107
|
-
#ifdef
|
5165
|
+
#ifdef GGML_CUDA_F16
|
3108
5166
|
if (src1_convert_f16) {
|
3109
5167
|
ggml_cuda_pool_free(src1_dfloat, ash);
|
3110
5168
|
}
|
3111
|
-
#endif //
|
5169
|
+
#endif // GGML_CUDA_F16
|
3112
5170
|
}
|
3113
5171
|
|
3114
5172
|
(void) src1;
|
@@ -3168,6 +5226,7 @@ inline void ggml_cuda_op_rope(
|
|
3168
5226
|
GGML_ASSERT(dst_ddf_i != nullptr);
|
3169
5227
|
|
3170
5228
|
const int64_t ne00 = src0->ne[0];
|
5229
|
+
const int64_t ne01 = src0->ne[1];
|
3171
5230
|
const int64_t i01_diff = i01_high - i01_low;
|
3172
5231
|
|
3173
5232
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
@@ -3181,17 +5240,18 @@ inline void ggml_cuda_op_rope(
|
|
3181
5240
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
3182
5241
|
|
3183
5242
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
3184
|
-
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
|
3185
5243
|
|
3186
|
-
bool is_glm = mode & 4;
|
5244
|
+
const bool is_glm = mode & 4;
|
3187
5245
|
|
3188
5246
|
// compute
|
3189
5247
|
if (is_glm) {
|
5248
|
+
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
|
3190
5249
|
const float id_p = min(p, n_ctx - 2.f);
|
3191
5250
|
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
3192
5251
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
3193
5252
|
} else {
|
3194
|
-
|
5253
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5254
|
+
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
3195
5255
|
}
|
3196
5256
|
|
3197
5257
|
(void) src1;
|
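A plausible reading of the new non-GLM RoPE arguments (an interpretation, not stated in the diff): with flattening enabled for non-GLM RoPE further below, rows from several i02 slices are processed in one launch, and since each slice contributes ne01 rows, row / ne01 recovers the slice index, so p0 + freq_scale * (row / ne01) reproduces the previous per-slice position (n_past + i02) * freq_scale. A tiny check of that identity:

```cpp
// Hypothetical check (illustration only): the flattened position for a row in
// slice i02 equals the per-slice position used before this change.
static float flattened_rope_position(int row, int ne01, int n_past, float freq_scale) {
    const int i02 = row / ne01;            // which slice this row belongs to
    return (n_past + i02) * freq_scale;    // == p0 + freq_scale * i02
}
```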
@@ -3362,8 +5422,17 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
3362
5422
|
|
3363
5423
|
int64_t row_low, row_high;
|
3364
5424
|
if (split) {
|
5425
|
+
const int64_t rounding = get_row_rounding(src0->type);
|
5426
|
+
|
3365
5427
|
row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
|
3366
|
-
|
5428
|
+
row_low -= row_low % rounding;
|
5429
|
+
|
5430
|
+
if (id == g_device_count - 1) {
|
5431
|
+
row_high = nrows0;
|
5432
|
+
} else {
|
5433
|
+
row_high = nrows0*g_tensor_split[id + 1];
|
5434
|
+
row_high -= row_high % rounding;
|
5435
|
+
}
|
3367
5436
|
} else {
|
3368
5437
|
row_low = 0;
|
3369
5438
|
row_high = nrows0*i02_divisor;
|
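When a tensor is split across devices, each device's row range is now snapped down to a multiple of get_row_rounding(src0->type), presumably so that a slice never cuts through a quantized super-block and its row count stays compatible with the mul_mat_q tile heights chosen above; the last device always takes the remainder up to nrows0. The same logic, restated as a standalone helper:

```cpp
#include <cstdint>

// Mirrors the split-boundary rounding introduced above.
static void split_row_range(int64_t nrows, const float * tensor_split, int id,
                            int device_count, int64_t rounding,
                            int64_t * row_low, int64_t * row_high) {
    *row_low  = id == 0 ? 0 : (int64_t)(nrows * tensor_split[id]);
    *row_low -= *row_low % rounding;

    if (id == device_count - 1) {
        *row_high = nrows;                 // last device takes the remainder
    } else {
        *row_high  = (int64_t)(nrows * tensor_split[id + 1]);
        *row_high -= *row_high % rounding;
    }
}
```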
@@ -3529,13 +5598,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
3529
5598
|
if (split) {
|
3530
5599
|
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
|
3531
5600
|
// dst is NOT transposed.
|
3532
|
-
// The outputs of
|
5601
|
+
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
3533
5602
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
3534
5603
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
3535
|
-
|
3536
|
-
|
3537
|
-
|
3538
|
-
}
|
5604
|
+
float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
|
5605
|
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
|
5606
|
+
i01_diff*sizeof(float), ne1, kind, cudaStream_main));
|
3539
5607
|
} else {
|
3540
5608
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
3541
5609
|
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
|
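In the split branch, each device's i01_diff-column result is now copied into its slot of the ne0-wide destination with cudaMemcpy2DAsync: destination pitch ne0*sizeof(float), source pitch and copy width i01_diff*sizeof(float), and ne1 rows. A plain host-side illustration of that strided copy (for clarity only; the real call is asynchronous and device-aware):

```cpp
#include <cstring>

// What the 2D copy above does, expressed as a synchronous host loop:
// `height` rows of `width` bytes, with independent source/destination pitches.
static void copy_2d(char * dst, size_t dpitch,
                    const char * src, size_t spitch,
                    size_t width, size_t height) {
    for (size_t r = 0; r < height; ++r) {
        std::memcpy(dst + r * dpitch, src + r * spitch, width);
    }
}
```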
@@ -3576,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
3576
5644
|
if (split && g_device_count > 1) {
|
3577
5645
|
CUDA_CHECK(cudaSetDevice(g_main_device));
|
3578
5646
|
for (int id = 0; id < g_device_count; ++id) {
|
3579
|
-
if (id != g_main_device) {
|
5647
|
+
if (id != g_main_device && src0_extra->events[id]) {
|
3580
5648
|
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
|
3581
5649
|
}
|
3582
5650
|
}
|
@@ -3718,7 +5786,19 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
|
|
3718
5786
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
3719
5787
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
|
3720
5788
|
} else {
|
3721
|
-
|
5789
|
+
int min_compute_capability = INT_MAX;
|
5790
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5791
|
+
if (min_compute_capability > g_compute_capabilities[id]
|
5792
|
+
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
5793
|
+
min_compute_capability = g_compute_capabilities[id];
|
5794
|
+
}
|
5795
|
+
}
|
5796
|
+
|
5797
|
+
if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
|
5798
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
|
5799
|
+
} else {
|
5800
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
5801
|
+
}
|
3722
5802
|
}
|
3723
5803
|
} else {
|
3724
5804
|
GGML_ASSERT(false);
|
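Matrix-matrix products over quantized weights are now routed to ggml_cuda_op_mul_mat_q instead of the cuBLAS path when the global toggle g_mul_mat_q is set (exposed through the new ggml_cuda_set_mul_mat_q setter near the end of this excerpt) and every device that owns a slice of the weight matrix is at least MIN_CC_DP4A. The predicate, restated:

```cpp
// Restates the dispatch condition above; min_compute_capability is the
// smallest capability among devices that actually own part of src0.
static bool should_use_mul_mat_q(bool mul_mat_q_enabled, bool src0_is_quantized,
                                 int min_compute_capability, int min_cc_dp4a) {
    return mul_mat_q_enabled && src0_is_quantized
        && min_compute_capability >= min_cc_dp4a;
}
```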
@@ -3795,7 +5875,10 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
|
|
3795
5875
|
|
3796
5876
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
3797
5877
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
3798
|
-
|
5878
|
+
|
5879
|
+
const int mode = ((int32_t *) dst->op_params)[2];
|
5880
|
+
const bool is_glm = mode & 4;
|
5881
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
3799
5882
|
}
|
3800
5883
|
|
3801
5884
|
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -3827,8 +5910,17 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
3827
5910
|
row_low = 0;
|
3828
5911
|
row_high = nrows;
|
3829
5912
|
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
5913
|
+
const int64_t rounding = get_row_rounding(tensor->type);
|
5914
|
+
|
3830
5915
|
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
3831
|
-
|
5916
|
+
row_low -= row_low % rounding;
|
5917
|
+
|
5918
|
+
if (id == g_device_count - 1) {
|
5919
|
+
row_high = nrows;
|
5920
|
+
} else {
|
5921
|
+
row_high = nrows*g_tensor_split[id + 1];
|
5922
|
+
row_high -= row_high % rounding;
|
5923
|
+
}
|
3832
5924
|
} else {
|
3833
5925
|
GGML_ASSERT(false);
|
3834
5926
|
}
|
@@ -4002,6 +6094,10 @@ void ggml_cuda_set_main_device(int main_device) {
|
|
4002
6094
|
}
|
4003
6095
|
}
|
4004
6096
|
|
6097
|
+
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
|
6098
|
+
g_mul_mat_q = mul_mat_q;
|
6099
|
+
}
|
6100
|
+
|
4005
6101
|
void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
4006
6102
|
g_scratch_size = scratch_size;
|
4007
6103
|
}
|