llama_cpp 0.2.1 → 0.2.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
@@ -13,6 +13,10 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 #define CUDA_CHECK(err) \
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11
 
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
+#ifdef GGML_CUDA_DMMV_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif //GGML_CUDA_DMMV_F16
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
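Note: the GGML_CUDA_DMMV_F16 switch introduced here makes the dequantize kernels precision-generic — dfloat/dfloat2 resolve to half/half2 when the flag is defined and to float/float2 otherwise, so one kernel body compiles for either precision. A minimal standalone sketch of the same pattern (illustrative only; scale_pair is a made-up name, not part of ggml):

    #include <cuda_fp16.h>

    #ifdef GGML_CUDA_DMMV_F16
    typedef half  dfloat;
    typedef half2 dfloat2;
    #else
    typedef float  dfloat;
    typedef float2 dfloat2;
    #endif

    // The same body serves both precisions: half2 math goes through the __h*
    // intrinsics, float math through plain operators, chosen at compile time.
    __device__ void scale_pair(dfloat2 & v, const dfloat d) {
    #ifdef GGML_CUDA_DMMV_F16
        v = __hmul2(v, {d, d}); // two fp16 multiplies per instruction
    #else
        v.x *= d;
        v.y *= d;
    #endif
    }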
@@ -167,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define GGML_CUDA_DMMV_Y 1
 #endif
 
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
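K_QUANTS_PER_ITERATION is a new build-time knob for the k-quant kernels further down: it defaults to 2, must be 1 or 2 (enforced by the static_assert), and trades the number of threads per row position against the quant groups each thread handles per loop trip. A small host-side sketch of the arithmetic (the build line is hypothetical):

    // e.g. nvcc -DK_QUANTS_PER_ITERATION=1 ... (hypothetical build line)
    #include <cstdio>

    #ifndef K_QUANTS_PER_ITERATION
    #define K_QUANTS_PER_ITERATION 2
    #else
    static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2,
                  "K_QUANTS_PER_ITERATION must be 1 or 2");
    #endif

    int main() {
        // Each 32-thread warp splits into row positions times strides:
        printf("strides per row: %d, positions per pass: %d\n",
               K_QUANTS_PER_ITERATION, 32 / K_QUANTS_PER_ITERATION);
        return 0;
    }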
@@ -224,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 }
 
-static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
-    const float d = x[ib].d;
+    const dfloat d = x[ib].d;
 
-    const uint8_t vui = x[ib].qs[iqs];
+    const int vui = x[ib].qs[iqs];
 
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
 
-    v0 = (vi0 - 8)*d;
-    v1 = (vi1 - 8)*d;
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const float d = x[ib].d;
-    const float m = x[ib].m;
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;
 
-    const uint8_t vui = x[ib].qs[iqs];
+    const int vui = x[ib].qs[iqs];
 
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
 
-    v0 = vi0*d + m;
-    v1 = vi1*d + m;
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
-    const float d = x[ib].d;
+    const dfloat d = x[ib].d;
 
     uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));
 
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
 
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1) - 16;
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
 
-    v0 = x0*d;
-    v1 = x1*d;
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const float d = x[ib].d;
-    const float m = x[ib].m;
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
 
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
 
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1);
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
 
-    v0 = x0*d + m;
-    v1 = x1*d + m;
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }
 
-static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
-    const float d = x[ib].d;
+    const dfloat d = x[ib].d;
 
-    const int8_t vi0 = x[ib].qs[iqs + 0];
-    const int8_t vi1 = x[ib].qs[iqs + 1];
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];
 
-    v0 = vi0*d;
-    v1 = vi1*d;
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_DMMV_F16
 }
 
 //================================== k-quants
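For reference, a worked instance of the q4_0 unpack used above: one byte packs two 4-bit weights, each biased by 8, and dequantization is (nibble - 8) * d. The byte and scale below are made-up values:

    #include <cstdio>
    #include <cstdint>

    int main() {
        const uint8_t vui = 0x3A;  // hypothetical quantized byte
        const float   d   = 0.5f;  // hypothetical block scale
        const float v0 = ((vui & 0xF) - 8.0f) * d;  // low nibble:  (10 - 8) * 0.5 =  1.0
        const float v1 = ((vui >> 4)  - 8.0f) * d;  // high nibble: ( 3 - 8) * 0.5 = -2.5
        printf("v0 = %.2f, v1 = %.2f\n", v0, v1);
        return 0;
    }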
@@ -326,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
 
 }
 
-static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
-    // y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128;         // 0 or 1
-    int r = iqs - 128*n;     // 0...120 in steps of 8
-    int l = r/8;             // 0...15 in steps of 1
-
-    const float   * y = yy + 128*n + l;
-    const uint8_t * q = x[ib].qs + 32*n + l;
-    const uint8_t * s = x[ib].scales + 8*n;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
-              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
-              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
-              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
-              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
-              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
-              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
-              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
-
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
 
     int r = threadIdx.x/4;
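The removed vec_dot_q2_K (and its replacement further down) evaluate y * (dall * scale * q - dmin * min), with a 4-bit scale in the low nibble and a 4-bit min in the high nibble of each scales byte. A worked instance with made-up numbers:

    #include <cstdio>

    int main() {
        const unsigned char s = 0xB4;            // packed byte: scale = 4, min = 11
        const float dall = 0.25f, dmin = 0.125f; // super-block scales
        const int   q = 3;                       // a 2-bit quant
        const float y = 2.0f;                    // an activation value
        const float contrib = y * (dall * (s & 0xF) * q - dmin * (s >> 4));
        printf("%.4f\n", contrib);               // 2 * (0.25*4*3 - 0.125*11) = 3.2500
        return 0;
    }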
@@ -388,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
 
 }
 
-static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
-    // y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128;         // 0 or 1
-    int r = iqs - 128*n;     // 0...120 in steps of 8
-    int l = r/8;             // 0...15 in steps of 1
-
-    const float   * y  = yy + 128*n + l;
-    const uint8_t * q  = x[ib].qs + 32*n + l;
-    const uint8_t * hm = x[ib].hmask + l;
-    const int8_t  * s  = (const int8_t *)utmp + 8*n;
-
-    memcpy(aux, x[ib].scales, 12);
-    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-    const float dall = x[ib].d;
-
-    const uint8_t m = 1 << (4*n);
-
-    float sum = y[  0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
-              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
-              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
-              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
-              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
-              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
-              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
-              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
-
-    result = sum * dall;
-
-}
-
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -479,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
     }
 }
 
-static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    // iqs is in 0...248 in steps of 8 =>
-    const int j  = iqs / 64;          // j  is in 0...3
-    const int ir = (iqs - 64*j)/2;    // ir is in 0...28 in steps of 4
-    const int is = 2*j;               // is is in 0...6 in steps of 2
-
-    const float   * y = yy + 64*j + ir;
-    const uint8_t * q = x[ib].qs + 32*j + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k +  0] * (d1 * (q[k] & 0xF) - m1);
-        sum += y[k + 32] * (d2 * (q[k] >>  4) - m2);
-    }
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
     const block_q5_K * x = (const block_q5_K *) vx;
 
@@ -544,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 }
 
-static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    // iqs is in 0...248 in steps of 8 =>
-    const int j  = iqs / 64;          // j  is in 0...3
-    const int ir = (iqs - 64*j)/2;    // ir is in 0...28 in steps of 4
-    const int is = 2*j;               // is is in 0...6 in steps of 2
-
-    const float   * y  = yy + 64*j + ir;
-    const uint8_t * ql = x[ib].qs + 32*j + ir;
-    const uint8_t * qh = x[ib].qh + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    uint8_t hm = 1 << is;
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k +  0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
-    }
-    hm <<= 1;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k + 32] * (d2 * ((ql[k] >>  4) + (qh[k] & hm ? 16 : 0)) - m2);
-    }
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     const block_q6_K * x = (const block_q6_K *) vx;
 
@@ -606,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
 }
 
-static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
 
-    const block_q6_K * x = (const block_q6_K *) vx;
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                 + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
 
-    const int ip = iqs / 128;          // 0 or 1
-    const int il = (iqs - 128*ip)/8;   // 0...15
-    const int is = 8*ip;
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-
+        const uint8_t * q1 = x[i].qs + q_offset;
+        const uint8_t * q2 = q1 + 64;
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
 
-
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;
 
-
-
-
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
-
-
-
-
-
-
-
-
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
+            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    //const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = threadIdx.x/2;  // 0...15
+    const int ix  = threadIdx.x%2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1 = 1 << (2*im);
+    const uint8_t hm2 = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * ql2 = ql1 + 64;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset  = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
 
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
 }
 
-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const half * x = (const half *) vx;
 
-    v0 = __half2float(x[ib + iqs + 0]);
-    v1 = __half2float(x[ib + iqs + 1]);
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
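Every dequantize_mul_mat_vec_*_k kernel added above ends with the same warp-level butterfly: XOR shuffles with masks 16, 8, 4, 2, 1 fold 32 partial sums so that every lane (in particular lane 0) ends up with the row total. A self-contained demonstration of just that reduction:

    #include <cstdio>

    __global__ void warp_sum(const float * x, float * out) {
        float tmp = x[threadIdx.x];
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); // butterfly step
        }
        if (threadIdx.x == 0) {
            *out = tmp;
        }
    }

    int main() {
        float h[32], *dx, *dout, result = 0.0f;
        for (int i = 0; i < 32; ++i) h[i] = 1.0f; // expected sum: 32.0
        cudaMalloc(&dx, 32*sizeof(float));
        cudaMalloc(&dout, sizeof(float));
        cudaMemcpy(dx, h, 32*sizeof(float), cudaMemcpyHostToDevice);
        warp_sum<<<1, 32>>>(dx, dout);
        cudaMemcpy(&result, dout, sizeof(float), cudaMemcpyDeviceToHost);
        printf("%.1f\n", result);
        return 0;
    }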
@@ -654,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     const int y_offset = qr == 1 ? 1 : qk/2;
 
     // dequantize
-    float & v0 = y[iybs + iqs + 0];
-    float & v1 = y[iybs + iqs + y_offset];
-    dequantize_kernel(vx, ib, iqs, v0, v1);
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0]        = v.x;
+    y[iybs + iqs + y_offset] = v.y;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -675,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
     const int y_offset = qr == 1 ? 1 : qk/2;
 
-    float tmp = 0; // partial sum for thread in warp
+    // partial sum for each thread
+#ifdef GGML_CUDA_DMMV_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_DMMV_F16
 
     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
@@ -689,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
         // process 2 vals per j iter
 
             // dequantize
-            float v0, v1;
-            dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
             // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
 
             // matrix multiplication
-            tmp += v0 * y[iybs + iqs + j/qr + 0];
-            tmp += v1 * y[iybs + iqs + j/qr + y_offset];
             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_DMMV_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x * y[iybs + iqs + j/qr + 0];
+            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+#endif // GGML_CUDA_DMMV_F16
         }
     }
 
@@ -708,47 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     }
 
     if (tid == 0) {
+#ifdef GGML_CUDA_DMMV_F16
+        dst[row] = tmp.x + tmp.y;
+#else
         dst[row] = tmp;
-    }
-}
-
-template <int n_thread, dot_kernel_k_t dot_kernel>
-static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = QK_K;
-    const int vals_per_iter = iter_stride / n_thread;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = ib0 + col/QK_K; // x block index
-        const int iqs = col%QK_K; // x quant index
-        const int iybs = col - col%QK_K; // y block start index
-
-        float v;
-        dot_kernel(vx, ib, iqs, y + iybs, v);
-        tmp += v;
-    }
-
-    // sum up partial sums and write back result
-    __syncthreads();
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
+#endif // GGML_CUDA_DMMV_F16
     }
 }
 
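With GGML_CUDA_DMMV_F16, the accumulator above is a half2 carrying two running sums, folded only at write-back (dst[row] = tmp.x + tmp.y). A minimal demonstration of that accumulate-then-fold idiom (the values are arbitrary):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void half2_fold(float * out) {
        half2 tmp = {0.0f, 0.0f};
        tmp = __hadd2(tmp, {1.5f, 2.25f}); // one paired accumulate
        tmp = __hadd2(tmp, {0.5f, 0.75f});
        *out = __low2float(tmp) + __high2float(tmp); // fold the two lanes
    }

    int main() {
        float *d_out, result = 0.0f;
        cudaMalloc(&d_out, sizeof(float));
        half2_fold<<<1, 1>>>(d_out);
        cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("%.2f\n", result); // 5.00
        return 0;
    }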
@@ -1043,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 }
 
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1052,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1061,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1070,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1079,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1090,47 +1322,44 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
 
 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    const dim3 block_dims(32, 1, 1);
+    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }
 
 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
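The k-quant launchers above keep a fixed 32-thread x-dimension and fold K_QUANTS_PER_ITERATION into the y-dimension: ny = 2 / K_QUANTS_PER_ITERATION rows share one block. A host-side sketch of the shape arithmetic (nrows is a made-up size):

    #include <cstdio>

    #ifndef K_QUANTS_PER_ITERATION
    #define K_QUANTS_PER_ITERATION 2
    #endif

    int main() {
        const int nrows = 4096;                        // hypothetical matrix rows
        const int ny = 2 / K_QUANTS_PER_ITERATION;     // rows per block
        const int block_num_y = (nrows + ny - 1) / ny; // ceil-divide rows over blocks
        printf("grid = (1, %d, 1), block = (32, %d, 1)\n", block_num_y, ny);
        return 0;
    }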
@@ -1138,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1306,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1342,15 +1565,8 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(cudaSetDevice(id));
 
-        // create streams
-        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-        }
-        // create events
-        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-        }
+        // create main stream
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
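Initialization now creates exactly one non-blocking stream per device in place of the former 8-stream/64-event pipeline. A standalone sketch of the simplified scheme (MAX_DEVICES is a stand-in for GGML_CUDA_MAX_DEVICES):

    #include <cuda_runtime.h>
    #include <cstdio>

    #define MAX_DEVICES 16 // stand-in for GGML_CUDA_MAX_DEVICES

    static cudaStream_t streams_main[MAX_DEVICES] = { nullptr };

    int main() {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);
        for (int id = 0; id < device_count && id < MAX_DEVICES; ++id) {
            cudaSetDevice(id);
            // one main stream per device; kernels and copies serialize on it
            cudaStreamCreateWithFlags(&streams_main[id], cudaStreamNonBlocking);
        }
        printf("created %d main stream(s)\n", device_count);
        return 0;
    }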
@@ -1566,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+    size_t ash;
+    dfloat * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                              ne00, 1, sizeof(float), 0, 0,
+                              ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+    }
+#else
+    dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_DMMV_F16
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q2_K:
             dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
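The staging step above converts the f32 activation vector to half once per call so the quantized kernels can use half2 math. Stripped of the ggml memory pool, the conversion amounts to a kernel like this sketch (f32_to_f16 is an illustrative stand-in for ggml_cpy_f32_f16_cuda, not the real API):

    #include <cuda_fp16.h>

    __global__ void f32_to_f16(const float * src, half * dst, const int n) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < n) {
            dst[i] = __float2half(src[i]);
        }
    }

    // usage sketch: f32_to_f16<<<(ne00 + 255)/256, 256, 0, stream>>>(src1_f32, src1_f16, ne00);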
@@ -1598,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
             dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+            convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         default:
             GGML_ASSERT(false);
@@ -1606,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     }
     CUDA_CHECK(cudaGetLastError());
 
+#ifdef GGML_CUDA_DMMV_F16
+    if (src1_convert_f16) {
+        ggml_cuda_pool_free(src1_dfloat, ash);
+    }
+#endif // GGML_CUDA_DMMV_F16
+
     (void) src1;
     (void) dst;
     (void) src0_ddf_i;
@@ -1817,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -1915,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
                 const int64_t i11 = i13*ne12 + i12;
 
-                cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-                cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+                cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -1945,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_main_device) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -1961,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
-            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -1977,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
-            // wait with main stream until src1 memcpy is done
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
 
@@ -2017,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2084,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2096,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2115,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2130,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2187,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2205,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
-    CUDA_CHECK(cudaDeviceSynchronize());
-
     (void) dst;
 }
 