llama_cpp 0.2.1 → 0.2.2
This diff shows the changes between publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
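Most of the `ggml-cuda.cu` changes below replace the per-type `vec_dot_*` device helpers with dedicated `dequantize_mul_mat_vec_*_k` kernels and add an optional half-precision dequantization path behind the `GGML_CUDA_DMMV_F16` compile flag, selected through `dfloat`/`dfloat2` typedefs. The sketch below is not part of the package; it is a minimal, self-contained illustration of that dfloat pattern (the helper name `scale_pair`, the file name, and the build line are assumptions, not code from the diff):

```cuda
// Minimal sketch of the dfloat abstraction introduced in this release (not from the package).
// Compile with, e.g.:  nvcc -DGGML_CUDA_DMMV_F16 -c dfloat_sketch.cu
#include <cuda_fp16.h>

#ifdef GGML_CUDA_DMMV_F16
typedef half  dfloat;   // dequantized value type
typedef half2 dfloat2;  // pair of dequantized values
#else
typedef float  dfloat;
typedef float2 dfloat2;
#endif

// Hypothetical helper mirroring the q8_0 dequantizer in the diff: scale two
// quantized values by d, using one half2 multiply when the f16 path is enabled.
static __device__ __forceinline__ void scale_pair(int8_t q0, int8_t q1, dfloat d, dfloat2 & v) {
    v.x = q0;
    v.y = q1;
#ifdef GGML_CUDA_DMMV_F16
    v = __hmul2(v, {d, d});  // scales both halves in a single instruction
#else
    v.x *= d;
    v.y *= d;
#endif
}
```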
--- 0.2.1/data/ext/llama_cpp/src/ggml-cuda.cu
+++ 0.2.2/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -13,6 +13,10 @@
 #include "ggml-cuda.h"
 #include "ggml.h"

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

 #define CUDA_CHECK(err) \
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11

-
+#ifdef GGML_CUDA_DMMV_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif //GGML_CUDA_DMMV_F16
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
@@ -167,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define GGML_CUDA_DMMV_Y 1
 #endif

+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -224,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 }

-static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;

-    const
+    const dfloat d = x[ib].d;

-    const
+    const int vui = x[ib].qs[iqs];

-
-
+    v.x = vui & 0xF;
+    v.y = vui >> 4;

-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }

-static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

-    const
-    const
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;

-    const
+    const int vui = x[ib].qs[iqs];

-
-
+    v.x = vui & 0xF;
+    v.y = vui >> 4;

-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }

-static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;

-    const
+    const dfloat d = x[ib].d;

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));

-    const
-    const
+    const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

-
-
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_DMMV_F16
 }

-static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

-    const
-    const
+    const dfloat d = x[ib].d;
+    const dfloat m = x[ib].m;

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));

-    const
-    const
+    const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

-
-
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_DMMV_F16
 }

-static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs,
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;

-    const
+    const dfloat d = x[ib].d;

-
-
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];

-
-
+#ifdef GGML_CUDA_DMMV_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_DMMV_F16
 }

 //================================== k-quants
@@ -326,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

 }

-static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
-    // y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128; // 0 or 1
-    int r = iqs - 128*n; // 0...120 in steps of 8
-    int l = r/8; // 0...15 in steps of 1
-
-    const float * y = yy + 128*n + l;
-    const uint8_t * q = x[ib].qs + 32*n + l;
-    const uint8_t * s = x[ib].scales + 8*n;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
-              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
-              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
-              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
-              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
-              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
-              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
-              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
-
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

     int r = threadIdx.x/4;
@@ -388,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

 }

-static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
-    // y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128; // 0 or 1
-    int r = iqs - 128*n; // 0...120 in steps of 8
-    int l = r/8; // 0...15 in steps of 1
-
-    const float * y = yy + 128*n + l;
-    const uint8_t * q = x[ib].qs + 32*n + l;
-    const uint8_t * hm = x[ib].hmask + l;
-    const int8_t * s = (const int8_t *)utmp + 8*n;
-
-    memcpy(aux, x[ib].scales, 12);
-    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-    const float dall = x[ib].d;
-
-    const uint8_t m = 1 << (4*n);
-
-    float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
-              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
-              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
-              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
-              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
-              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
-              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
-              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
-
-    result = sum * dall;
-
-}
-
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -479,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
     }
 }

-static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    // iqs is in 0...248 in steps of 8 =>
-    const int j = iqs / 64; // j is in 0...3
-    const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
-    const int is = 2*j; // is is in 0...6 in steps of 2
-
-    const float * y = yy + 64*j + ir;
-    const uint8_t * q = x[ib].qs + 32*j + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
-        sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
-    }
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

@@ -544,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
     y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 }

-static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    // iqs is in 0...248 in steps of 8 =>
-    const int j = iqs / 64; // j is in 0...3
-    const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
-    const int is = 2*j; // is is in 0...6 in steps of 2
-
-    const float * y = yy + 64*j + ir;
-    const uint8_t * ql = x[ib].qs + 32*j + ir;
-    const uint8_t * qh = x[ib].qh + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    uint8_t hm = 1 << is;
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
-    }
-    hm <<= 1;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
-    }
-    result = sum;
-
-}
-
 static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

@@ -606,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
 }

-static
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

-
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;      // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im; // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                 + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il = tid/step;                             // 0...3
+    const int ir = tid - step*il;                        // 0...7 or 0...3
+    const int n  = 2 * K_QUANTS_PER_ITERATION;           // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp

-
-    const int il = (iqs - 128*ip)/8; // 0...15
-    const int is = 8*ip;
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

-
+        const uint8_t * q1 = x[i].qs + q_offset;
+        const uint8_t * q2 = q1 + 64;
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;

-
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;

-
-
-
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

-
-
-
-
-
-
-
-
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
+            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    //const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = threadIdx.x/2;  // 0...15
+    const int ix  = threadIdx.x%2;
+
+    const int il = tid/4;      // 0...3
+    const int ir = tid - 4*il; // 0...3
+    const int n  = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1 = 1 << (2*im);
+    const uint8_t hm2 = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * ql2 = ql1 + 64;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].d;
+        const float dmin = x[i].dmin;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < n; ++l) {
+            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset  = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float * y = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t * s = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }

+    if (tid == 0) {
+        dst[row] = tmp;
+    }
 }

-static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const half * x = (const half *) vx;

-
-
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -654,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     const int y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
-
-
-
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x;
+    y[iybs + iqs + y_offset] = v.y;
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const
+static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -675,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
     const int y_offset = qr == 1 ? 1 : qk/2;

-
+    // partial sum for each thread
+#ifdef GGML_CUDA_DMMV_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_DMMV_F16

     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
@@ -689,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
             // process 2 vals per j iter

             // dequantize
-            float v0, v1;
-            dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
             // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);

             // matrix multiplication
-            tmp += v0 * y[iybs + iqs + j/qr + 0];
-            tmp += v1 * y[iybs + iqs + j/qr + y_offset];
             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_DMMV_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x * y[iybs + iqs + j/qr + 0];
+            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+#endif // GGML_CUDA_DMMV_F16
         }
     }

@@ -708,47 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     }

     if (tid == 0) {
+#ifdef GGML_CUDA_DMMV_F16
+        dst[row] = tmp.x + tmp.y;
+#else
         dst[row] = tmp;
-
-}
-
-template <int n_thread, dot_kernel_k_t dot_kernel>
-static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = QK_K;
-    const int vals_per_iter = iter_stride / n_thread;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = ib0 + col/QK_K; // x block index
-        const int iqs = col%QK_K; // x quant index
-        const int iybs = col - col%QK_K; // y block start index
-
-        float v;
-        dot_kernel(vx, ib, iqs, y + iybs, v);
-        tmp += v;
-    }
-
-    // sum up partial sums and write back result
-    __syncthreads();
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
+#endif // GGML_CUDA_DMMV_F16
     }
 }

@@ -1043,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 }

-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1052,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1061,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1070,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1079,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1090,47 +1322,44 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f

 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-
+    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-
+    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const
-
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+    const dim3 block_dims(32, 1, 1);
+    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }

 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2;
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
-
+    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1138,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void convert_mul_mat_vec_f16_cuda(const void * vx, const
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
@@ -1306,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;

-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES]
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };

 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1342,15 +1565,8 @@ void ggml_init_cublas() {
         for (int id = 0; id < g_device_count; ++id) {
             CUDA_CHECK(cudaSetDevice(id));

-            // create
-
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-            }
-            // create events
-            for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-            }
+            // create main stream
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));

             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1566,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;

+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_DMMV_F16
+    size_t ash;
+    dfloat * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                              ne00, 1, sizeof(float), 0, 0,
+                              ne00, 1, sizeof(half), 0, 0, cudaStream_main);
+    }
+#else
+    dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_DMMV_F16
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i,
+            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i,
+            dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i,
+            dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i,
+            dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i,
+            dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q2_K:
             dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -1598,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
             dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_ddq_i,
+            convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         default:
             GGML_ASSERT(false);
@@ -1606,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     }
     CUDA_CHECK(cudaGetLastError());

+#ifdef GGML_CUDA_DMMV_F16
+    if (src1_convert_f16) {
+        ggml_cuda_pool_free(src1_dfloat, ash);
+    }
+#endif // GGML_CUDA_DMMV_F16
+
     (void) src1;
     (void) dst;
     (void) src0_ddf_i;
@@ -1817,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -1915,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;

-            cudaStream_t cudaStream_main
-            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id];

             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -1945,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1,
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_main_device) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice,
+                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -1961,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
-            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));

             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -1977,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }

-            // wait with main stream until src1 memcpy is done
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);

@@ -2017,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2084,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2096,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }

 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2115,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2130,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);

     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }

 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2187,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2205,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }

-
-    CUDA_CHECK(cudaDeviceSynchronize());
     (void) dst;
 }
