llama_cpp 0.2.1 → 0.2.2: changes to the bundled ggml CUDA backend (ggml-cuda.cu)

@@ -13,6 +13,10 @@
13
13
  #include "ggml-cuda.h"
14
14
  #include "ggml.h"
15
15
 
16
+ #if defined(_MSC_VER)
17
+ #pragma warning(disable: 4244 4267) // possible loss of data
18
+ #endif
19
+
16
20
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
17
21
 
18
22
  #define CUDA_CHECK(err) \
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
46
50
  } while (0)
47
51
  #endif // CUDART_VERSION >= 11
48
52
 
49
- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
53
+ #ifdef GGML_CUDA_DMMV_F16
54
+ typedef half dfloat; // dequantize float
55
+ typedef half2 dfloat2;
56
+ #else
57
+ typedef float dfloat; // dequantize float
58
+ typedef float2 dfloat2;
59
+ #endif //GGML_CUDA_DMMV_F16
60
+
61
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
50
62
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
51
63
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
52
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
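Note on the hunk above: the old two-float dequantize signature is replaced by a single dfloat2 output, so every dequantizer can be compiled either for plain float or, with GGML_CUDA_DMMV_F16, for packed half2 arithmetic. A minimal standalone sketch of how a dequantizer and its caller interact under the new typedefs (the toy_* names are illustrative only, not from this diff):

    #include <cuda_fp16.h>

    #ifdef GGML_CUDA_DMMV_F16
    typedef half  dfloat;   // dequantized value type
    typedef half2 dfloat2;  // pair of dequantized values
    #else
    typedef float  dfloat;
    typedef float2 dfloat2;
    #endif

    // toy dequantizer: unpack two 4-bit values from one byte into a dfloat2
    static __device__ __forceinline__ void toy_dequantize(const void * vx, const int i, dfloat2 & v) {
        const unsigned char q = ((const unsigned char *) vx)[i];
        v.x = q & 0xF;
        v.y = q >> 4;
    }

    // toy consumer: the same kernel body compiles for both the float and the half build
    static __global__ void toy_store(const void * vx, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) return;
        dfloat2 v;
        toy_dequantize(vx, i, v);
        dst[2*i + 0] = v.x;  // implicit half -> float conversion in the F16 build
        dst[2*i + 1] = v.y;
    }
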
@@ -167,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
167
179
  #define GGML_CUDA_DMMV_Y 1
168
180
  #endif
169
181
 
182
+ #ifndef K_QUANTS_PER_ITERATION
183
+ #define K_QUANTS_PER_ITERATION 2
184
+ #else
185
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
186
+ #endif
187
+
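K_QUANTS_PER_ITERATION is a new compile-time knob that controls how the work inside each quantized super-block is divided among the threads of a warp in the k-quant kernels added below; the static_assert rejects any value other than 1 or 2. A small sketch of the guard-with-fallback pattern and what it feeds into (the -D flag shown is an assumption about the build, not something stated in this diff):

    // normally injected by the build system, e.g.  nvcc -DK_QUANTS_PER_ITERATION=1 ...
    #ifndef K_QUANTS_PER_ITERATION
    #define K_QUANTS_PER_ITERATION 2
    #else
    static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2,
                  "K_QUANTS_PER_ITERATION must be 1 or 2");
    #endif

    // the k-quant mat-vec kernels derive their thread layout from it, for example:
    const int step = 16 / K_QUANTS_PER_ITERATION;  // 16 or 8 thread groups per 128-value half of a super-block
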
170
188
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
171
189
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
172
190
 
@@ -224,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
224
242
  }
225
243
  }
226
244
 
227
- static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
245
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
228
246
  const block_q4_0 * x = (const block_q4_0 *) vx;
229
247
 
230
- const float d = x[ib].d;
248
+ const dfloat d = x[ib].d;
231
249
 
232
- const uint8_t vui = x[ib].qs[iqs];
250
+ const int vui = x[ib].qs[iqs];
233
251
 
234
- const int8_t vi0 = vui & 0xF;
235
- const int8_t vi1 = vui >> 4;
252
+ v.x = vui & 0xF;
253
+ v.y = vui >> 4;
236
254
 
237
- v0 = (vi0 - 8)*d;
238
- v1 = (vi1 - 8)*d;
255
+ #ifdef GGML_CUDA_DMMV_F16
256
+ v = __hsub2(v, {8.0f, 8.0f});
257
+ v = __hmul2(v, {d, d});
258
+ #else
259
+ v.x = (v.x - 8.0f) * d;
260
+ v.y = (v.y - 8.0f) * d;
261
+ #endif // GGML_CUDA_DMMV_F16
239
262
  }
240
263
 
241
- static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
264
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
242
265
  const block_q4_1 * x = (const block_q4_1 *) vx;
243
266
 
244
- const float d = x[ib].d;
245
- const float m = x[ib].m;
267
+ const dfloat d = x[ib].d;
268
+ const dfloat m = x[ib].m;
246
269
 
247
- const uint8_t vui = x[ib].qs[iqs];
270
+ const int vui = x[ib].qs[iqs];
248
271
 
249
- const int8_t vi0 = vui & 0xF;
250
- const int8_t vi1 = vui >> 4;
272
+ v.x = vui & 0xF;
273
+ v.y = vui >> 4;
251
274
 
252
- v0 = vi0*d + m;
253
- v1 = vi1*d + m;
275
+ #ifdef GGML_CUDA_DMMV_F16
276
+ v = __hmul2(v, {d, d});
277
+ v = __hadd2(v, {m, m});
278
+ #else
279
+ v.x = (v.x * d) + m;
280
+ v.y = (v.y * d) + m;
281
+ #endif // GGML_CUDA_DMMV_F16
254
282
  }
255
283
 
256
- static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
284
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
257
285
  const block_q5_0 * x = (const block_q5_0 *) vx;
258
286
 
259
- const float d = x[ib].d;
287
+ const dfloat d = x[ib].d;
260
288
 
261
289
  uint32_t qh;
262
290
  memcpy(&qh, x[ib].qh, sizeof(qh));
263
291
 
264
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
265
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
292
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
293
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
266
294
 
267
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
268
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
295
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
296
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
269
297
 
270
- v0 = x0*d;
271
- v1 = x1*d;
298
+ #ifdef GGML_CUDA_DMMV_F16
299
+ v = __hsub2(v, {16.0f, 16.0f});
300
+ v = __hmul2(v, {d, d});
301
+ #else
302
+ v.x = (v.x - 16.0f) * d;
303
+ v.y = (v.y - 16.0f) * d;
304
+ #endif // GGML_CUDA_DMMV_F16
272
305
  }
273
306
 
274
- static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
307
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
275
308
  const block_q5_1 * x = (const block_q5_1 *) vx;
276
309
 
277
- const float d = x[ib].d;
278
- const float m = x[ib].m;
310
+ const dfloat d = x[ib].d;
311
+ const dfloat m = x[ib].m;
279
312
 
280
313
  uint32_t qh;
281
314
  memcpy(&qh, x[ib].qh, sizeof(qh));
282
315
 
283
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
284
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
316
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
317
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
285
318
 
286
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
287
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
319
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
320
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
288
321
 
289
- v0 = x0*d + m;
290
- v1 = x1*d + m;
322
+ #ifdef GGML_CUDA_DMMV_F16
323
+ v = __hmul2(v, {d, d});
324
+ v = __hadd2(v, {m, m});
325
+ #else
326
+ v.x = (v.x * d) + m;
327
+ v.y = (v.y * d) + m;
328
+ #endif // GGML_CUDA_DMMV_F16
291
329
  }
292
330
 
293
- static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
331
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
294
332
  const block_q8_0 * x = (const block_q8_0 *) vx;
295
333
 
296
- const float d = x[ib].d;
334
+ const dfloat d = x[ib].d;
297
335
 
298
- const int8_t vi0 = x[ib].qs[iqs + 0];
299
- const int8_t vi1 = x[ib].qs[iqs + 1];
336
+ v.x = x[ib].qs[iqs + 0];
337
+ v.y = x[ib].qs[iqs + 1];
300
338
 
301
- v0 = vi0*d;
302
- v1 = vi1*d;
339
+ #ifdef GGML_CUDA_DMMV_F16
340
+ v = __hmul2(v, {d, d});
341
+ #else
342
+ v.x *= d;
343
+ v.y *= d;
344
+ #endif // GGML_CUDA_DMMV_F16
303
345
  }
304
346
 
305
347
  //================================== k-quants
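Each of the dequantize_q4_0 … dequantize_q8_0 functions above now writes its two values into a dfloat2 and, when GGML_CUDA_DMMV_F16 is defined, applies the offset and scale with packed half2 intrinsics (__hsub2, __hmul2, __hadd2) instead of two scalar float operations; native half arithmetic needs a GPU of compute capability 5.3 or newer, which is presumably why this stays behind the flag. A condensed sketch of just the arithmetic, mirroring the q4_0 case (illustration, not the file's code):

    #include <cuda_fp16.h>

    #ifdef GGML_CUDA_DMMV_F16
    typedef half  dfloat;
    typedef half2 dfloat2;
    #else
    typedef float  dfloat;
    typedef float2 dfloat2;
    #endif

    // v = (v - 8) * d for both lanes, as in dequantize_q4_0
    static __device__ __forceinline__ void scale_q4_0_pair(dfloat2 & v, const dfloat d) {
    #ifdef GGML_CUDA_DMMV_F16
        v = __hsub2(v, {8.0f, 8.0f});   // subtract the 4-bit zero point from both halves
        v = __hmul2(v, {d, d});         // scale both halves with one packed instruction
    #else
        v.x = (v.x - 8.0f) * d;
        v.y = (v.y - 8.0f) * d;
    #endif
    }
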
@@ -326,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
326
368
 
327
369
  }
328
370
 
329
- static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
330
-
331
- const block_q2_K * x = (const block_q2_K *) vx;
332
-
333
- // if n is 0, we want to do the lower 128, else the upper 128,
334
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
335
- // y[l+16], y[l+48], y[l+80], y[l+112]
336
- int n = iqs/128; // 0 or 1
337
- int r = iqs - 128*n; // 0...120 in steps of 8
338
- int l = r/8; // 0...15 in steps of 1
339
-
340
- const float * y = yy + 128*n + l;
341
- const uint8_t * q = x[ib].qs + 32*n + l;
342
- const uint8_t * s = x[ib].scales + 8*n;
343
-
344
- const float dall = x[ib].d;
345
- const float dmin = x[ib].dmin;
346
-
347
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
348
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
349
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
350
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
351
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
352
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
353
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
354
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
355
-
356
- result = sum;
357
-
358
- }
359
-
360
371
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
361
372
 
362
373
  int r = threadIdx.x/4;
@@ -388,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
388
399
 
389
400
  }
390
401
 
391
- static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
392
-
393
- const block_q3_K * x = (const block_q3_K *) vx;
394
-
395
- const uint32_t kmask1 = 0x03030303;
396
- const uint32_t kmask2 = 0x0f0f0f0f;
397
-
398
- uint32_t aux[3];
399
- uint32_t utmp[4];
400
-
401
- // if n is 0, we want to do the lower 128, else the upper 128,
402
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
403
- // y[l+16], y[l+48], y[l+80], y[l+112]
404
- int n = iqs/128; // 0 or 1
405
- int r = iqs - 128*n; // 0...120 in steps of 8
406
- int l = r/8; // 0...15 in steps of 1
407
-
408
- const float * y = yy + 128*n + l;
409
- const uint8_t * q = x[ib].qs + 32*n + l;
410
- const uint8_t * hm = x[ib].hmask + l;
411
- const int8_t * s = (const int8_t *)utmp + 8*n;
412
-
413
- memcpy(aux, x[ib].scales, 12);
414
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
415
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
416
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
417
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
418
-
419
- const float dall = x[ib].d;
420
-
421
- const uint8_t m = 1 << (4*n);
422
-
423
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
424
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
425
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
426
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
427
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
428
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
429
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
430
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
431
-
432
- result = sum * dall;
433
-
434
- }
435
-
436
402
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
437
403
  if (j < 4) {
438
404
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -479,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
479
445
  }
480
446
  }
481
447
 
482
- static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
483
-
484
- const block_q4_K * x = (const block_q4_K *) vx;
485
-
486
- // iqs is in 0...248 in steps of 8 =>
487
- const int j = iqs / 64; // j is in 0...3
488
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
489
- const int is = 2*j; // is is in 0...6 in steps of 2
490
-
491
- const float * y = yy + 64*j + ir;
492
- const uint8_t * q = x[ib].qs + 32*j + ir;
493
-
494
- const float dall = x[ib].d;
495
- const float dmin = x[ib].dmin;
496
-
497
- uint8_t sc, m;
498
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
499
- const float d1 = dall * sc;
500
- const float m1 = dmin * m;
501
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
502
- const float d2 = dall * sc;
503
- const float m2 = dmin * m;
504
-
505
- float sum = 0;
506
- for (int k = 0; k < 4; ++k) {
507
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
508
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
509
- }
510
- result = sum;
511
-
512
- }
513
-
514
448
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
515
449
  const block_q5_K * x = (const block_q5_K *) vx;
516
450
 
@@ -544,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
544
478
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
545
479
  }
546
480
 
547
- static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
548
-
549
- const block_q5_K * x = (const block_q5_K *) vx;
550
-
551
- // iqs is in 0...248 in steps of 8 =>
552
- const int j = iqs / 64; // j is in 0...3
553
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
554
- const int is = 2*j; // is is in 0...6 in steps of 2
555
-
556
- const float * y = yy + 64*j + ir;
557
- const uint8_t * ql = x[ib].qs + 32*j + ir;
558
- const uint8_t * qh = x[ib].qh + ir;
559
-
560
- const float dall = x[ib].d;
561
- const float dmin = x[ib].dmin;
562
-
563
- uint8_t sc, m;
564
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
565
- const float d1 = dall * sc;
566
- const float m1 = dmin * m;
567
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
568
- const float d2 = dall * sc;
569
- const float m2 = dmin * m;
570
-
571
- uint8_t hm = 1 << is;
572
- float sum = 0;
573
- for (int k = 0; k < 4; ++k) {
574
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
575
- }
576
- hm <<= 1;
577
- for (int k = 0; k < 4; ++k) {
578
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
579
- }
580
- result = sum;
581
-
582
- }
583
-
584
481
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
585
482
  const block_q6_K * x = (const block_q6_K *) vx;
586
483
 
@@ -606,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
606
503
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
607
504
  }
608
505
 
609
- static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
506
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
610
507
 
611
- const block_q6_K * x = (const block_q6_K *) vx;
508
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
509
+
510
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
511
+ if (row > nrows) return;
512
+
513
+ const int num_blocks_per_row = ncols / QK_K;
514
+ const int ib0 = row*num_blocks_per_row;
515
+
516
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
517
+
518
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
519
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
520
+
521
+ const int step = 16/K_QUANTS_PER_ITERATION;
522
+
523
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
524
+ const int in = tid - step*im; // 0...15 or 0...7
525
+
526
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
527
+ const int q_offset = 32*im + l0;
528
+ const int s_offset = 8*im;
529
+ const int y_offset = 128*im + l0;
530
+
531
+ float tmp = 0; // partial sum for thread in warp
532
+
533
+ uint32_t aux[4];
534
+ const uint8_t * d = (const uint8_t *)aux;
535
+ const uint8_t * m = (const uint8_t *)(aux + 2);
536
+
537
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
538
+
539
+ const float * y = yy + i * QK_K + y_offset;
540
+ const uint8_t * q = x[i].qs + q_offset;
541
+
542
+ const float dall = x[i].d;
543
+ const float dmin = x[i].dmin;
544
+
545
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
546
+ aux[0] = a[0] & 0x0f0f0f0f;
547
+ aux[1] = a[1] & 0x0f0f0f0f;
548
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
549
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
550
+
551
+ float sum1 = 0, sum2 = 0;
552
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
553
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
554
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
555
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
556
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
557
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
558
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
559
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
560
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
561
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
562
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
563
+
564
+ }
565
+ tmp += dall * sum1 - dmin * sum2;
566
+
567
+ }
568
+
569
+ // sum up partial sums and write back result
570
+ __syncthreads();
571
+ #pragma unroll
572
+ for (int mask = 16; mask > 0; mask >>= 1) {
573
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
574
+ }
575
+
576
+ if (tid == 0) {
577
+ dst[row] = tmp;
578
+ }
579
+ }
580
+
581
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
582
+
583
+ const uint16_t kmask1 = 0x0303;
584
+ const uint16_t kmask2 = 0x0f0f;
585
+
586
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
587
+ if (row > nrows) return;
588
+
589
+ const int num_blocks_per_row = ncols / QK_K;
590
+ const int ib0 = row*num_blocks_per_row;
591
+
592
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
593
+
594
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
595
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
596
+
597
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
598
+ const int step = 16/K_QUANTS_PER_ITERATION;
599
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
600
+ const int in = tid - step*im; // 0....15 or 0...7
601
+
602
+ const uint8_t m = 1 << (4*im);
603
+
604
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
605
+ const int q_offset = 32*im + l0;
606
+ const int y_offset = 128*im + l0;
607
+
608
+ uint16_t utmp[4];
609
+ const int8_t * s = (const int8_t *)utmp;
610
+
611
+ const uint16_t s_shift = 4*im;
612
+
613
+ float tmp = 0; // partial sum for thread in warp
614
+
615
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
+
617
+ const float * y = yy + i * QK_K + y_offset;
618
+ const uint8_t * q = x[i].qs + q_offset;
619
+ const uint8_t * h = x[i].hmask + l0;
620
+
621
+ const uint16_t * a = (const uint16_t *)x[i].scales;
622
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
623
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
624
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
625
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
626
+
627
+ const float d = x[i].d;
628
+
629
+ float sum = 0;
630
+ for (int l = 0; l < n; ++l) {
631
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
632
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
633
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
634
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
635
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
636
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
637
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
638
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
639
+ }
640
+ tmp += d * sum;
641
+
642
+ }
643
+
644
+ // sum up partial sums and write back result
645
+ __syncthreads();
646
+ #pragma unroll
647
+ for (int mask = 16; mask > 0; mask >>= 1) {
648
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
649
+ }
650
+
651
+ if (tid == 0) {
652
+ dst[row] = tmp;
653
+ }
654
+ }
655
+
656
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
657
+
658
+ const uint16_t kmask1 = 0x3f3f;
659
+ const uint16_t kmask2 = 0x0f0f;
660
+ const uint16_t kmask3 = 0xc0c0;
661
+
662
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
663
+ if (row > nrows) return;
664
+ const int num_blocks_per_row = ncols / QK_K;
665
+ const int ib0 = row*num_blocks_per_row;
666
+
667
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
668
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
669
+
670
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
671
+
672
+ const int il = tid/step; // 0...3
673
+ const int ir = tid - step*il; // 0...7 or 0...3
674
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
675
+
676
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
677
+ const int in = il%2;
678
+
679
+ const int l0 = n*(2*ir + in);
680
+ const int q_offset = 32*im + l0;
681
+ const int y_offset = 64*im + l0;
682
+
683
+ uint16_t aux[4];
684
+ const uint8_t * sc = (const uint8_t *)aux;
685
+
686
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
687
+
688
+ float tmp = 0; // partial sum for thread in warp
612
689
 
613
- const int ip = iqs / 128; // 0 or 1
614
- const int il = (iqs - 128*ip)/8; // 0...15
615
- const int is = 8*ip;
690
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
691
 
617
- const float * y = yy + 128*ip + il;
692
+ const uint8_t * q1 = x[i].qs + q_offset;
693
+ const uint8_t * q2 = q1 + 64;
694
+ const float * y1 = yy + i*QK_K + y_offset;
695
+ const float * y2 = y1 + 128;
618
696
 
619
- const float d = x[ib].d;
697
+ const float dall = x[i].d;
698
+ const float dmin = x[i].dmin;
620
699
 
621
- const uint8_t * ql = x[ib].ql + 64*ip + il;
622
- const uint8_t * qh = x[ib].qh + 32*ip + il;
623
- const int8_t * sc = x[ib].scales + is;
700
+ const uint16_t * a = (const uint16_t *)x[i].scales;
701
+ aux[0] = a[im+0] & kmask1;
702
+ aux[1] = a[im+2] & kmask1;
703
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
704
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
624
705
 
625
- result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
626
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
627
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
628
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
629
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
630
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
631
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
632
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
706
+ float4 s = {0.f, 0.f, 0.f, 0.f};
707
+ float smin = 0;
708
+ for (int l = 0; l < n; ++l) {
709
+ s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
710
+ s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
711
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
712
+ }
713
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
714
+
715
+ }
716
+
717
+ // sum up partial sums and write back result
718
+ __syncthreads();
719
+ #pragma unroll
720
+ for (int mask = 16; mask > 0; mask >>= 1) {
721
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
722
+ }
723
+
724
+ if (tid == 0) {
725
+ dst[row] = tmp;
726
+ }
727
+ }
728
+
729
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
730
+
731
+ const uint16_t kmask1 = 0x3f3f;
732
+ const uint16_t kmask2 = 0x0f0f;
733
+ const uint16_t kmask3 = 0xc0c0;
734
+
735
+ //const int row = blockIdx.x*blockDim.y + threadIdx.y;
736
+ const int row = blockIdx.x;
737
+ const int num_blocks_per_row = ncols / QK_K;
738
+ const int ib0 = row*num_blocks_per_row;
739
+
740
+ const int tid = threadIdx.x/2; // 0...15
741
+ const int ix = threadIdx.x%2;
742
+
743
+ const int il = tid/4; // 0...3
744
+ const int ir = tid - 4*il;// 0...3
745
+ const int n = 2;
746
+
747
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
748
+ const int in = il%2;
749
+
750
+ const int l0 = n*(2*ir + in);
751
+ const int q_offset = 32*im + l0;
752
+ const int y_offset = 64*im + l0;
753
+
754
+ const uint8_t hm1 = 1 << (2*im);
755
+ const uint8_t hm2 = hm1 << 4;
756
+
757
+ uint16_t aux[4];
758
+ const uint8_t * sc = (const uint8_t *)aux;
759
+
760
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
761
+
762
+ float tmp = 0; // partial sum for thread in warp
763
+
764
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
765
+
766
+ const uint8_t * ql1 = x[i].qs + q_offset;
767
+ const uint8_t * ql2 = ql1 + 64;
768
+ const uint8_t * qh = x[i].qh + l0;
769
+ const float * y1 = yy + i*QK_K + y_offset;
770
+ const float * y2 = y1 + 128;
771
+
772
+ const float dall = x[i].d;
773
+ const float dmin = x[i].dmin;
774
+
775
+ const uint16_t * a = (const uint16_t *)x[i].scales;
776
+ aux[0] = a[im+0] & kmask1;
777
+ aux[1] = a[im+2] & kmask1;
778
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
779
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
780
+
781
+ float4 sum = {0.f, 0.f, 0.f, 0.f};
782
+ float smin = 0;
783
+ for (int l = 0; l < n; ++l) {
784
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
785
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
786
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
787
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
788
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
789
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
790
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
791
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
792
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
793
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
794
+ }
795
+ tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
796
+
797
+ }
798
+
799
+ // sum up partial sums and write back result
800
+ __syncthreads();
801
+ #pragma unroll
802
+ for (int mask = 16; mask > 0; mask >>= 1) {
803
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
804
+ }
805
+
806
+ if (tid == 0) {
807
+ dst[row] = tmp;
808
+ }
809
+ }
810
+
811
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
812
+
813
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
814
+
815
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
816
+ if (row > nrows) return;
817
+
818
+ const int num_blocks_per_row = ncols / QK_K;
819
+ const int ib0 = row*num_blocks_per_row;
820
+
821
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
822
+
823
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
824
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
825
+
826
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
827
+
828
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
829
+ const int in = tid - step*im; // 0...15 or 0...7
830
+
831
+ #if K_QUANTS_PER_ITERATION == 1
832
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
833
+ const int is = 0;
834
+ #else
835
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
836
+ const int is = in / 4;
837
+ #endif
838
+ const int ql_offset = 64*im + l0;
839
+ const int qh_offset = 32*im + l0;
840
+ const int s_offset = 8*im + is;
841
+ const int y_offset = 128*im + l0;
842
+
843
+ float tmp = 0; // partial sum for thread in warp
844
+
845
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
846
+
847
+ const float * y = yy + i * QK_K + y_offset;
848
+ const uint8_t * ql = x[i].ql + ql_offset;
849
+ const uint8_t * qh = x[i].qh + qh_offset;
850
+ const int8_t * s = x[i].scales + s_offset;
851
+
852
+ const float d = x[i].d;
853
+
854
+ #if K_QUANTS_PER_ITERATION == 1
855
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
856
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
857
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
858
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
859
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
860
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
861
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
862
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
863
+ tmp += sum;
864
+ #else
865
+ float sum = 0;
866
+ for (int l = 0; l < 4; ++l) {
867
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
868
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
869
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
870
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
871
+ }
872
+ tmp += sum;
873
+ #endif
874
+
875
+ }
876
+
877
+ // sum up partial sums and write back result
878
+ __syncthreads();
879
+ #pragma unroll
880
+ for (int mask = 16; mask > 0; mask >>= 1) {
881
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
882
+ }
633
883
 
884
+ if (tid == 0) {
885
+ dst[row] = tmp;
886
+ }
634
887
  }
635
888
 
636
- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
889
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
637
890
  const half * x = (const half *) vx;
638
891
 
639
- v0 = __half2float(x[ib + iqs + 0]);
640
- v1 = __half2float(x[ib + iqs + 1]);
892
+ // automatic half -> float type cast if dfloat == float
893
+ v.x = x[ib + iqs + 0];
894
+ v.y = x[ib + iqs + 1];
641
895
  }
642
896
 
643
897
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
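The per-super-block vec_dot_q*_K helpers and the generic dequantize_mul_mat_vec_k template they plugged into are removed; each k-quant type now gets its own fused matrix-vector kernel (dequantize_mul_mat_vec_q2_k … q6_k above) that maps one warp to one output row, keeps a per-thread partial sum, and reduces it with warp shuffles. A self-contained skeleton of that shared structure, with the quantized dot product replaced by a plain float product for clarity (names are illustrative, not from this diff):

    #include <cuda_runtime.h>

    // one warp per row in x, blockDim.y rows per block in y
    static __global__ void rowwise_dot_skeleton(const float * x, const float * y,
                                                float * dst, const int ncols, const int nrows) {
        const int row = blockIdx.y*blockDim.y + threadIdx.y;
        if (row >= nrows) return;

        float tmp = 0.0f;                                    // partial sum for this thread
        for (int col = threadIdx.x; col < ncols; col += 32) {
            tmp += x[row*ncols + col] * y[col];              // the real kernels dequantize here
        }

        // butterfly reduction: after 5 steps every lane holds the full row sum
        #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
        }
        if (threadIdx.x == 0) {
            dst[row] = tmp;                                   // lane 0 writes the result
        }
    }
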
@@ -654,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
654
908
  const int y_offset = qr == 1 ? 1 : qk/2;
655
909
 
656
910
  // dequantize
657
- float & v0 = y[iybs + iqs + 0];
658
- float & v1 = y[iybs + iqs + y_offset];
659
- dequantize_kernel(vx, ib, iqs, v0, v1);
911
+ dfloat2 v;
912
+ dequantize_kernel(vx, ib, iqs, v);
913
+
914
+ y[iybs + iqs + 0] = v.x;
915
+ y[iybs + iqs + y_offset] = v.y;
660
916
  }
661
917
 
662
918
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
663
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
919
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
664
920
  // qk = quantized weights per x block
665
921
  // qr = number of quantized weights per data value in x block
666
922
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -675,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
675
931
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
676
932
  const int y_offset = qr == 1 ? 1 : qk/2;
677
933
 
678
- float tmp = 0.0f; // partial sum for thread in warp
934
+ // partial sum for each thread
935
+ #ifdef GGML_CUDA_DMMV_F16
936
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
937
+ #else
938
+ float tmp = 0.0f;
939
+ #endif // GGML_CUDA_DMMV_F16
679
940
 
680
941
  for (int i = 0; i < ncols; i += iter_stride) {
681
942
  const int col = i + vals_per_iter*tid;
@@ -689,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
689
950
  // process 2 vals per j iter
690
951
 
691
952
  // dequantize
692
- float v0, v1;
693
- dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
694
953
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
954
+ dfloat2 v;
955
+ dequantize_kernel(vx, ib, iqs + j/qr, v);
695
956
 
696
957
  // matrix multiplication
697
- tmp += v0 * y[iybs + iqs + j/qr + 0];
698
- tmp += v1 * y[iybs + iqs + j/qr + y_offset];
699
958
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
959
+ #ifdef GGML_CUDA_DMMV_F16
960
+ tmp += __hmul2(v, {
961
+ y[iybs + iqs + j/qr + 0],
962
+ y[iybs + iqs + j/qr + y_offset]
963
+ });
964
+ #else
965
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
966
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
967
+ #endif // GGML_CUDA_DMMV_F16
700
968
  }
701
969
  }
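With GGML_CUDA_DMMV_F16 the generic dequantize_mul_mat_vec keeps its running total in a half2, so the pair of products per iteration is accumulated with one packed operation and the two lanes are folded into a float only when dst[row] is written (the tmp.x + tmp.y in the next hunk). A compact sketch of that accumulate-then-fold step (needs native fp16 math, i.e. compute capability 5.3+; the function name is illustrative):

    #include <cuda_fp16.h>

    static __device__ float dot2_f16_demo(const half2 v, const half2 y_pair) {
        half2 tmp = __float2half2_rn(0.0f);                 // two zero-initialized running sums
        tmp = __hadd2(tmp, __hmul2(v, y_pair));             // both multiply-adds in one packed step
        return __half2float(tmp.x) + __half2float(tmp.y);   // fold the lanes on write-out
    }
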
702
970
 
@@ -708,47 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
708
976
  }
709
977
 
710
978
  if (tid == 0) {
979
+ #ifdef GGML_CUDA_DMMV_F16
980
+ dst[row] = tmp.x + tmp.y;
981
+ #else
711
982
  dst[row] = tmp;
712
- }
713
- }
714
-
715
- template <int n_thread, dot_kernel_k_t dot_kernel>
716
- static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
717
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
718
-
719
- if (row >= nrows) {
720
- return;
721
- }
722
-
723
- const int tid = threadIdx.x;
724
-
725
- const int iter_stride = QK_K;
726
- const int vals_per_iter = iter_stride / n_thread;
727
- const int num_blocks_per_row = ncols / QK_K;
728
- const int ib0 = row*num_blocks_per_row;
729
-
730
- float tmp = 0; // partial sum for thread in warp
731
-
732
- for (int i = 0; i < ncols; i += iter_stride) {
733
- const int col = i + vals_per_iter*tid;
734
- const int ib = ib0 + col/QK_K; // x block index
735
- const int iqs = col%QK_K; // x quant index
736
- const int iybs = col - col%QK_K; // y block start index
737
-
738
- float v;
739
- dot_kernel(vx, ib, iqs, y + iybs, v);
740
- tmp += v;
741
- }
742
-
743
- // sum up partial sums and write back result
744
- __syncthreads();
745
- #pragma unroll
746
- for (int mask = 16; mask > 0; mask >>= 1) {
747
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
748
- }
749
-
750
- if (tid == 0) {
751
- dst[row] = tmp;
983
+ #endif // GGML_CUDA_DMMV_F16
752
984
  }
753
985
  }
754
986
 
@@ -1043,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1043
1275
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1044
1276
  }
1045
1277
 
1046
- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1278
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1047
1279
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1048
1280
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1049
1281
  const dim3 block_nums(1, block_num_y, 1);
@@ -1052,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
1052
1284
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1053
1285
  }
1054
1286
 
1055
- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1287
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1056
1288
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1057
1289
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1058
1290
  const dim3 block_nums(1, block_num_y, 1);
@@ -1061,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
1061
1293
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1062
1294
  }
1063
1295
 
1064
- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1296
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1065
1297
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1066
1298
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1067
1299
  const dim3 block_nums(1, block_num_y, 1);
@@ -1070,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
1070
1302
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1071
1303
  }
1072
1304
 
1073
- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1305
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1074
1306
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1075
1307
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1076
1308
  const dim3 block_nums(1, block_num_y, 1);
@@ -1079,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
1079
1311
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1080
1312
  }
1081
1313
 
1082
- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1314
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1083
1315
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1084
1316
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1085
1317
  const dim3 block_nums(1, block_num_y, 1);
@@ -1090,47 +1322,44 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
1090
1322
 
1091
1323
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1092
1324
  GGML_ASSERT(ncols % QK_K == 0);
1093
- const int ny = 2;
1325
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1094
1326
  const int block_num_y = (nrows + ny - 1) / ny;
1095
1327
  const dim3 block_nums(1, block_num_y, 1);
1096
1328
  const dim3 block_dims(32, ny, 1);
1097
- dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1329
+ dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1098
1330
  }
1099
1331
 
1100
1332
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1101
1333
  GGML_ASSERT(ncols % QK_K == 0);
1102
- const int ny = 2;
1334
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1103
1335
  const int block_num_y = (nrows + ny - 1) / ny;
1104
1336
  const dim3 block_nums(1, block_num_y, 1);
1105
1337
  const dim3 block_dims(32, ny, 1);
1106
- dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1338
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1107
1339
  }
1108
1340
 
1109
1341
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1110
1342
  GGML_ASSERT(ncols % QK_K == 0);
1111
- const int ny = 2;
1343
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1112
1344
  const int block_num_y = (nrows + ny - 1) / ny;
1113
1345
  const dim3 block_nums(1, block_num_y, 1);
1114
1346
  const dim3 block_dims(32, ny, 1);
1115
- dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1347
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1116
1348
  }
1117
1349
 
1118
1350
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1119
1351
  GGML_ASSERT(ncols % QK_K == 0);
1120
- const int ny = 2;
1121
- const int block_num_y = (nrows + ny - 1) / ny;
1122
- const dim3 block_nums(1, block_num_y, 1);
1123
- const dim3 block_dims(32, ny, 1);
1124
- dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1352
+ const dim3 block_dims(32, 1, 1);
1353
+ dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
1125
1354
  }
1126
1355
 
1127
1356
  static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1128
1357
  GGML_ASSERT(ncols % QK_K == 0);
1129
- const int ny = 2;
1358
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1130
1359
  const int block_num_y = (nrows + ny - 1) / ny;
1131
1360
  const dim3 block_nums(1, block_num_y, 1);
1132
1361
  const dim3 block_dims(32, ny, 1);
1133
- dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1362
+ dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1134
1363
  }
1135
1364
 
1136
1365
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1138,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1138
1367
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1139
1368
  }
1140
1369
 
1141
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1370
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1142
1371
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1143
1372
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1144
1373
  const dim3 block_nums(1, block_num_y, 1);
@@ -1306,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
1306
1535
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1307
1536
  static size_t g_scratch_offset = 0;
1308
1537
 
1309
- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1310
- #define GGML_CUDA_MAX_EVENTS 64
1311
-
1312
1538
  static int g_device_count = -1;
1313
1539
  static int g_main_device = 0;
1314
1540
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1315
1541
 
1316
1542
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1317
1543
 
1318
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1319
-
1320
- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1321
- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1544
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
1322
1545
 
1323
1546
  void ggml_init_cublas() {
1324
1547
  static bool initialized = false;
@@ -1342,15 +1565,8 @@ void ggml_init_cublas() {
1342
1565
  for (int id = 0; id < g_device_count; ++id) {
1343
1566
  CUDA_CHECK(cudaSetDevice(id));
1344
1567
 
1345
- // create streams
1346
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1347
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1348
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1349
- }
1350
- // create events
1351
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1352
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1353
- }
1568
+ // create main stream
1569
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
1354
1570
 
1355
1571
  // create cublas handle
1356
1572
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
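The separate memcpy streams and events per device are gone: each device now owns a single non-blocking main stream created once in ggml_init_cublas, and both the src1 copies and the kernels are issued on it. Work on one stream executes in issue order, which is why the cudaEventRecord/cudaStreamWaitEvent pair that used to guard the copy (removed further down) is no longer needed. A minimal standalone illustration of that ordering guarantee (all names here are placeholders, not from this diff):

    #include <cuda_runtime.h>

    static __global__ void scale_kernel(float * x, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) x[i] *= 2.0f;
    }

    static void copy_then_compute(float * dst_dev, const float * src_host, const int n,
                                  cudaStream_t stream_main) {
        // async copy and kernel on the same stream: the kernel cannot start before the copy finishes
        cudaMemcpyAsync(dst_dev, src_host, n*sizeof(float), cudaMemcpyHostToDevice, stream_main);
        scale_kernel<<<(n + 255)/256, 256, 0, stream_main>>>(dst_dev, n);
    }
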
@@ -1566,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1566
1782
  const int64_t ne00 = src0->ne[0];
1567
1783
  const int64_t nrows = i01_high - i01_low;
1568
1784
 
1785
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1786
+ #ifdef GGML_CUDA_DMMV_F16
1787
+ size_t ash;
1788
+ dfloat * src1_dfloat = nullptr; // dfloat == half
1789
+
1790
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1791
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1792
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1793
+
1794
+ if (src1_convert_f16) {
1795
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
1796
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
1797
+ ne00, 1, sizeof(float), 0, 0,
1798
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
1799
+ }
1800
+ #else
1801
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
1802
+ #endif // GGML_CUDA_DMMV_F16
1803
+
1569
1804
  switch (src0->type) {
1570
1805
  case GGML_TYPE_Q4_0:
1571
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1806
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1572
1807
  break;
1573
1808
  case GGML_TYPE_Q4_1:
1574
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1809
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1575
1810
  break;
1576
1811
  case GGML_TYPE_Q5_0:
1577
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1812
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1578
1813
  break;
1579
1814
  case GGML_TYPE_Q5_1:
1580
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1815
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1581
1816
  break;
1582
1817
  case GGML_TYPE_Q8_0:
1583
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1818
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1584
1819
  break;
1585
1820
  case GGML_TYPE_Q2_K:
1586
1821
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
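When GGML_CUDA_DMMV_F16 is enabled, src1 is converted to half into a buffer taken from the CUDA memory pool before the mat-vec launch and released again after the switch; only the non-k quantization types (Q4_0, Q4_1, Q5_0, Q5_1, Q8_0) and F16 weights take this path, while the k-quant cases keep reading the original float buffer src1_ddf_i, as the Q2_K case above shows. The conversion reuses the existing ggml_cpy_f32_f16_cuda copy kernel; conceptually it boils down to something like the following sketch (not the file's actual implementation):

    #include <cuda_fp16.h>

    static __global__ void f32_to_f16(const float * x, half * y, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < k) {
            y[i] = __float2half(x[i]);   // round-to-nearest float -> half conversion
        }
    }
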
@@ -1598,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1598
1833
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1599
1834
  break;
1600
1835
  case GGML_TYPE_F16:
1601
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1836
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1602
1837
  break;
1603
1838
  default:
1604
1839
  GGML_ASSERT(false);
@@ -1606,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1606
1841
  }
1607
1842
  CUDA_CHECK(cudaGetLastError());
1608
1843
 
1844
+ #ifdef GGML_CUDA_DMMV_F16
1845
+ if (src1_convert_f16) {
1846
+ ggml_cuda_pool_free(src1_dfloat, ash);
1847
+ }
1848
+ #endif // GGML_CUDA_DMMV_F16
1849
+
1609
1850
  (void) src1;
1610
1851
  (void) dst;
1611
1852
  (void) src0_ddf_i;
@@ -1817,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1817
2058
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1818
2059
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1819
2060
 
2061
+ // if multiple GPUs are used they need to wait for the main GPU to finish
2062
+ if (split && g_device_count > 1) {
2063
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2064
+ CUDA_CHECK(cudaDeviceSynchronize());
2065
+ }
2066
+
1820
2067
  for (int id = 0; id < g_device_count; ++id) {
1821
2068
  if (!split && id != g_main_device) {
1822
2069
  continue;
@@ -1915,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1915
2162
  }
1916
2163
  const int64_t i11 = i13*ne12 + i12;
1917
2164
 
1918
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1919
- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1920
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2165
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
1921
2166
 
1922
2167
  // for split tensors the data begins at i0 == i0_offset_low
1923
2168
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -1945,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1945
2190
  if (src1->backend == GGML_BACKEND_CPU) {
1946
2191
  GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
1947
2192
  int64_t nrows1 = flatten_rows ? nrows0 : ne11;
1948
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
2193
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
1949
2194
  } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
1950
2195
  if (id != g_main_device) {
1951
2196
  GGML_ASSERT(!flatten_rows);
1952
2197
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1953
2198
  src1_ddf_i_source += i11*src1_stride;
1954
2199
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1955
- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2200
+ cudaMemcpyDeviceToDevice, cudaStream_main));
1956
2201
  }
1957
2202
  } else if (src1_on_device && !src1_is_contiguous) {
1958
2203
  GGML_ASSERT(!split);
@@ -1961,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1961
2206
  GGML_ASSERT(false);
1962
2207
  }
1963
2208
  }
1964
- CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1965
2209
 
1966
2210
  if (!src0_on_device || !src0_is_contiguous) {
1967
2211
  if (src0_is_f32) {
@@ -1977,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1977
2221
  CUDA_CHECK(cudaGetLastError());
1978
2222
  }
1979
2223
 
1980
- // wait with main stream until src1 memcpy is done
1981
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
1982
-
1983
2224
  // do the computation
1984
2225
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
1985
2226
 
@@ -2017,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2017
2258
 
2018
2259
  // wait until each device is finished, then free their buffers
2019
2260
  for (int id = 0; id < g_device_count; ++id) {
2261
+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
2262
+ continue;
2263
+ }
2264
+
2020
2265
  CUDA_CHECK(cudaSetDevice(id));
2021
2266
  CUDA_CHECK(cudaDeviceSynchronize());
2267
+
2022
2268
  if (src0_asq[id] > 0) {
2023
2269
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
2024
2270
  }
@@ -2084,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2084
2330
  const int64_t ne02 = src0->ne[2];
2085
2331
 
2086
2332
  CUDA_CHECK(cudaSetDevice(g_main_device));
2087
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2333
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2088
2334
 
2089
2335
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2090
2336
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2096,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2096
2342
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2097
2343
 
2098
2344
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2099
-
2100
- CUDA_CHECK(cudaDeviceSynchronize());
2101
2345
  }
2102
2346
 
2103
2347
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2115,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2115
2359
  const int64_t nb02 = src0->nb[2];
2116
2360
 
2117
2361
  CUDA_CHECK(cudaSetDevice(g_main_device));
2118
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2362
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2119
2363
 
2120
2364
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2121
2365
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2130,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2130
2374
  const int channel_stride_x = nb02 / sizeof(half);
2131
2375
 
2132
2376
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2133
-
2134
- CUDA_CHECK(cudaDeviceSynchronize());
2135
2377
  }
2136
2378
 
2137
2379
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2187,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2187
2429
  const int64_t nb12 = src1->nb[2];
2188
2430
 
2189
2431
  CUDA_CHECK(cudaSetDevice(g_main_device));
2190
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2432
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2191
2433
 
2192
2434
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2193
2435
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2205,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2205
2447
  GGML_ASSERT(false);
2206
2448
  }
2207
2449
 
2208
- CUDA_CHECK(cudaDeviceSynchronize());
2209
-
2210
2450
  (void) dst;
2211
2451
  }
2212
2452
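
The trailing cudaDeviceSynchronize calls after ggml_cuda_mul_mat_vec_p021, ggml_cuda_mul_mat_vec_nc and ggml_cuda_cpy are dropped: their results stay queued on the main device's main stream, so later operations issued on the same stream see them in order without a device-wide stall, and the host only needs to block where it actually reads data back. A small sketch of that stream-ordered hand-off (placeholder names, not from this diff):

    #include <cuda_runtime.h>

    static void read_back_result(float * host_dst, const float * dev_src, const int n,
                                 cudaStream_t stream_main) {
        cudaMemcpyAsync(host_dst, dev_src, n*sizeof(float), cudaMemcpyDeviceToHost, stream_main);
        cudaStreamSynchronize(stream_main);   // block only when the host really needs the data
    }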