llama_cpp 0.2.1 → 0.2.2

@@ -13,6 +13,10 @@
13
13
  #include "ggml-cuda.h"
14
14
  #include "ggml.h"
15
15
 
16
+ #if defined(_MSC_VER)
17
+ #pragma warning(disable: 4244 4267) // possible loss of data
18
+ #endif
19
+
16
20
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
17
21
 
18
22
  #define CUDA_CHECK(err) \
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
46
50
  } while (0)
47
51
  #endif // CUDART_VERSION >= 11
48
52
 
49
- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
53
+ #ifdef GGML_CUDA_DMMV_F16
54
+ typedef half dfloat; // dequantize float
55
+ typedef half2 dfloat2;
56
+ #else
57
+ typedef float dfloat; // dequantize float
58
+ typedef float2 dfloat2;
59
+ #endif //GGML_CUDA_DMMV_F16
60
+
61
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
50
62
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
51
63
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
52
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
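
The dfloat/dfloat2 typedefs introduced above select the precision of the whole dequantize/dot-product pipeline at compile time: half/half2 when GGML_CUDA_DMMV_F16 is defined, float/float2 otherwise, and every dequantize kernel now writes a pair of values through a single dfloat2 reference. A self-contained sketch of the idea (illustrative only; build with or without -DGGML_CUDA_DMMV_F16 to switch):

    #include <cuda_fp16.h>

    #ifdef GGML_CUDA_DMMV_F16
    typedef half  dfloat;   // dequantized value type
    typedef half2 dfloat2;  // pair of dequantized values
    #else
    typedef float  dfloat;
    typedef float2 dfloat2;
    #endif

    // one out-parameter instead of two separate float references
    typedef void (*dequantize_kernel_t)(const void * vx, const int ib,
                                        const int iqs, dfloat2 & v);

    static_assert(sizeof(dfloat2) == 2*sizeof(dfloat), "dfloat2 must pack two dfloats");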
@@ -167,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
167
179
  #define GGML_CUDA_DMMV_Y 1
168
180
  #endif
169
181
 
182
+ #ifndef K_QUANTS_PER_ITERATION
183
+ #define K_QUANTS_PER_ITERATION 2
184
+ #else
185
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
186
+ #endif
187
+
170
188
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
171
189
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
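
K_QUANTS_PER_ITERATION (1 or 2, enforced by the static_assert above) controls how the 32 threads of a warp are mapped onto the super-blocks of a row in the k-quant kernels added later in this diff: each thread starts at super-block offset ix and walks the row in strides of K_QUANTS_PER_ITERATION. A small sketch of that mapping (illustrative helper, not code from the package):

    #ifndef K_QUANTS_PER_ITERATION
    #define K_QUANTS_PER_ITERATION 2
    #endif

    __device__ void kquant_thread_mapping(int & tid, int & ix, int & step) {
        tid  = threadIdx.x / K_QUANTS_PER_ITERATION; // 0...31 (K=1) or 0...15 (K=2)
        ix   = threadIdx.x % K_QUANTS_PER_ITERATION; // super-block offset this thread starts at
        step = 16 / K_QUANTS_PER_ITERATION;          // threads covering one 128-value half
    }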
172
190
 
@@ -224,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
224
242
  }
225
243
  }
226
244
 
227
- static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
245
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
228
246
  const block_q4_0 * x = (const block_q4_0 *) vx;
229
247
 
230
- const float d = x[ib].d;
248
+ const dfloat d = x[ib].d;
231
249
 
232
- const uint8_t vui = x[ib].qs[iqs];
250
+ const int vui = x[ib].qs[iqs];
233
251
 
234
- const int8_t vi0 = vui & 0xF;
235
- const int8_t vi1 = vui >> 4;
252
+ v.x = vui & 0xF;
253
+ v.y = vui >> 4;
236
254
 
237
- v0 = (vi0 - 8)*d;
238
- v1 = (vi1 - 8)*d;
255
+ #ifdef GGML_CUDA_DMMV_F16
256
+ v = __hsub2(v, {8.0f, 8.0f});
257
+ v = __hmul2(v, {d, d});
258
+ #else
259
+ v.x = (v.x - 8.0f) * d;
260
+ v.y = (v.y - 8.0f) * d;
261
+ #endif // GGML_CUDA_DMMV_F16
239
262
  }
240
263
 
241
- static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
264
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
242
265
  const block_q4_1 * x = (const block_q4_1 *) vx;
243
266
 
244
- const float d = x[ib].d;
245
- const float m = x[ib].m;
267
+ const dfloat d = x[ib].d;
268
+ const dfloat m = x[ib].m;
246
269
 
247
- const uint8_t vui = x[ib].qs[iqs];
270
+ const int vui = x[ib].qs[iqs];
248
271
 
249
- const int8_t vi0 = vui & 0xF;
250
- const int8_t vi1 = vui >> 4;
272
+ v.x = vui & 0xF;
273
+ v.y = vui >> 4;
251
274
 
252
- v0 = vi0*d + m;
253
- v1 = vi1*d + m;
275
+ #ifdef GGML_CUDA_DMMV_F16
276
+ v = __hmul2(v, {d, d});
277
+ v = __hadd2(v, {m, m});
278
+ #else
279
+ v.x = (v.x * d) + m;
280
+ v.y = (v.y * d) + m;
281
+ #endif // GGML_CUDA_DMMV_F16
254
282
  }
255
283
 
256
- static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
284
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
257
285
  const block_q5_0 * x = (const block_q5_0 *) vx;
258
286
 
259
- const float d = x[ib].d;
287
+ const dfloat d = x[ib].d;
260
288
 
261
289
  uint32_t qh;
262
290
  memcpy(&qh, x[ib].qh, sizeof(qh));
263
291
 
264
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
265
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
292
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
293
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
266
294
 
267
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
268
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
295
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
296
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
269
297
 
270
- v0 = x0*d;
271
- v1 = x1*d;
298
+ #ifdef GGML_CUDA_DMMV_F16
299
+ v = __hsub2(v, {16.0f, 16.0f});
300
+ v = __hmul2(v, {d, d});
301
+ #else
302
+ v.x = (v.x - 16.0f) * d;
303
+ v.y = (v.y - 16.0f) * d;
304
+ #endif // GGML_CUDA_DMMV_F16
272
305
  }
273
306
 
274
- static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
307
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
275
308
  const block_q5_1 * x = (const block_q5_1 *) vx;
276
309
 
277
- const float d = x[ib].d;
278
- const float m = x[ib].m;
310
+ const dfloat d = x[ib].d;
311
+ const dfloat m = x[ib].m;
279
312
 
280
313
  uint32_t qh;
281
314
  memcpy(&qh, x[ib].qh, sizeof(qh));
282
315
 
283
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
284
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
316
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
317
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
285
318
 
286
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
287
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
319
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
320
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
288
321
 
289
- v0 = x0*d + m;
290
- v1 = x1*d + m;
322
+ #ifdef GGML_CUDA_DMMV_F16
323
+ v = __hmul2(v, {d, d});
324
+ v = __hadd2(v, {m, m});
325
+ #else
326
+ v.x = (v.x * d) + m;
327
+ v.y = (v.y * d) + m;
328
+ #endif // GGML_CUDA_DMMV_F16
291
329
  }
292
330
 
293
- static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
331
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
294
332
  const block_q8_0 * x = (const block_q8_0 *) vx;
295
333
 
296
- const float d = x[ib].d;
334
+ const dfloat d = x[ib].d;
297
335
 
298
- const int8_t vi0 = x[ib].qs[iqs + 0];
299
- const int8_t vi1 = x[ib].qs[iqs + 1];
336
+ v.x = x[ib].qs[iqs + 0];
337
+ v.y = x[ib].qs[iqs + 1];
300
338
 
301
- v0 = vi0*d;
302
- v1 = vi1*d;
339
+ #ifdef GGML_CUDA_DMMV_F16
340
+ v = __hmul2(v, {d, d});
341
+ #else
342
+ v.x *= d;
343
+ v.y *= d;
344
+ #endif // GGML_CUDA_DMMV_F16
303
345
  }
304
346
 
305
347
  //================================== k-quants
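
The rewritten dequantize helpers above produce the same values as before; only the output type and the arithmetic path change (plain float math, or __hsub2/__hmul2/__hadd2 on half2 when GGML_CUDA_DMMV_F16 is defined). For reference, the q4_0 case in plain host-side float (a comparison sketch, not code from the package):

    #include <cstdint>

    // One q4_0 byte packs two 4-bit weights; both are offset by -8 and scaled
    // by the per-block delta d, exactly as the device code above does.
    static void dequantize_q4_0_ref(uint8_t qs, float d, float & v0, float & v1) {
        v0 = (float)((qs & 0xF) - 8) * d;
        v1 = (float)((qs >>  4) - 8) * d;
    }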
@@ -326,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
326
368
 
327
369
  }
328
370
 
329
- static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
330
-
331
- const block_q2_K * x = (const block_q2_K *) vx;
332
-
333
- // if n is 0, we want to do the lower 128, else the upper 128,
334
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
335
- // y[l+16], y[l+48], y[l+80], y[l+112]
336
- int n = iqs/128; // 0 or 1
337
- int r = iqs - 128*n; // 0...120 in steps of 8
338
- int l = r/8; // 0...15 in steps of 1
339
-
340
- const float * y = yy + 128*n + l;
341
- const uint8_t * q = x[ib].qs + 32*n + l;
342
- const uint8_t * s = x[ib].scales + 8*n;
343
-
344
- const float dall = x[ib].d;
345
- const float dmin = x[ib].dmin;
346
-
347
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
348
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
349
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
350
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
351
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
352
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
353
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
354
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
355
-
356
- result = sum;
357
-
358
- }
359
-
360
371
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
361
372
 
362
373
  int r = threadIdx.x/4;
@@ -388,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
388
399
 
389
400
  }
390
401
 
391
- static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
392
-
393
- const block_q3_K * x = (const block_q3_K *) vx;
394
-
395
- const uint32_t kmask1 = 0x03030303;
396
- const uint32_t kmask2 = 0x0f0f0f0f;
397
-
398
- uint32_t aux[3];
399
- uint32_t utmp[4];
400
-
401
- // if n is 0, we want to do the lower 128, else the upper 128,
402
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
403
- // y[l+16], y[l+48], y[l+80], y[l+112]
404
- int n = iqs/128; // 0 or 1
405
- int r = iqs - 128*n; // 0...120 in steps of 8
406
- int l = r/8; // 0...15 in steps of 1
407
-
408
- const float * y = yy + 128*n + l;
409
- const uint8_t * q = x[ib].qs + 32*n + l;
410
- const uint8_t * hm = x[ib].hmask + l;
411
- const int8_t * s = (const int8_t *)utmp + 8*n;
412
-
413
- memcpy(aux, x[ib].scales, 12);
414
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
415
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
416
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
417
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
418
-
419
- const float dall = x[ib].d;
420
-
421
- const uint8_t m = 1 << (4*n);
422
-
423
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
424
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
425
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
426
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
427
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
428
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
429
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
430
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
431
-
432
- result = sum * dall;
433
-
434
- }
435
-
436
402
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
437
403
  if (j < 4) {
438
404
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -479,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
479
445
  }
480
446
  }
481
447
 
482
- static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
483
-
484
- const block_q4_K * x = (const block_q4_K *) vx;
485
-
486
- // iqs is in 0...248 in steps of 8 =>
487
- const int j = iqs / 64; // j is in 0...3
488
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
489
- const int is = 2*j; // is is in 0...6 in steps of 2
490
-
491
- const float * y = yy + 64*j + ir;
492
- const uint8_t * q = x[ib].qs + 32*j + ir;
493
-
494
- const float dall = x[ib].d;
495
- const float dmin = x[ib].dmin;
496
-
497
- uint8_t sc, m;
498
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
499
- const float d1 = dall * sc;
500
- const float m1 = dmin * m;
501
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
502
- const float d2 = dall * sc;
503
- const float m2 = dmin * m;
504
-
505
- float sum = 0;
506
- for (int k = 0; k < 4; ++k) {
507
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
508
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
509
- }
510
- result = sum;
511
-
512
- }
513
-
514
448
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
515
449
  const block_q5_K * x = (const block_q5_K *) vx;
516
450
 
@@ -544,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
544
478
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
545
479
  }
546
480
 
547
- static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
548
-
549
- const block_q5_K * x = (const block_q5_K *) vx;
550
-
551
- // iqs is in 0...248 in steps of 8 =>
552
- const int j = iqs / 64; // j is in 0...3
553
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
554
- const int is = 2*j; // is is in 0...6 in steps of 2
555
-
556
- const float * y = yy + 64*j + ir;
557
- const uint8_t * ql = x[ib].qs + 32*j + ir;
558
- const uint8_t * qh = x[ib].qh + ir;
559
-
560
- const float dall = x[ib].d;
561
- const float dmin = x[ib].dmin;
562
-
563
- uint8_t sc, m;
564
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
565
- const float d1 = dall * sc;
566
- const float m1 = dmin * m;
567
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
568
- const float d2 = dall * sc;
569
- const float m2 = dmin * m;
570
-
571
- uint8_t hm = 1 << is;
572
- float sum = 0;
573
- for (int k = 0; k < 4; ++k) {
574
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
575
- }
576
- hm <<= 1;
577
- for (int k = 0; k < 4; ++k) {
578
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
579
- }
580
- result = sum;
581
-
582
- }
583
-
584
481
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
585
482
  const block_q6_K * x = (const block_q6_K *) vx;
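
The per-format vec_dot helpers removed above, together with vec_dot_q6_K and the generic dequantize_mul_mat_vec_k template removed further down, are superseded by dedicated dequantize_mul_mat_vec_qX_k kernels added in the next hunk, which fuse dequantization, the dot product and the warp reduction. Their shared structure, stripped of the per-format bit unpacking, is roughly the following (simplified sketch that operates on already-dequantized weights):

    // One 32-thread warp accumulates one matrix row; a butterfly shuffle
    // combines the per-lane partial sums and lane 0 writes the result.
    static __global__ void mul_mat_vec_skeleton(const float * x, const float * y,
                                                float * dst, const int ncols, const int nrows) {
        const int row = blockIdx.y*blockDim.y + threadIdx.y;
        if (row >= nrows) {
            return;
        }

        const float * xr = x + row*ncols;

        float tmp = 0.0f; // partial sum for this lane
        for (int col = threadIdx.x; col < ncols; col += 32) {
            tmp += xr[col] * y[col];
        }

        #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
        }

        if (threadIdx.x == 0) {
            dst[row] = tmp;
        }
    }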
586
483
 
@@ -606,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
606
503
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
607
504
  }
608
505
 
609
- static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
506
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
610
507
 
611
- const block_q6_K * x = (const block_q6_K *) vx;
508
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
509
+
510
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
511
+ if (row > nrows) return;
512
+
513
+ const int num_blocks_per_row = ncols / QK_K;
514
+ const int ib0 = row*num_blocks_per_row;
515
+
516
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
517
+
518
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
519
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
520
+
521
+ const int step = 16/K_QUANTS_PER_ITERATION;
522
+
523
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
524
+ const int in = tid - step*im; // 0...15 or 0...7
525
+
526
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
527
+ const int q_offset = 32*im + l0;
528
+ const int s_offset = 8*im;
529
+ const int y_offset = 128*im + l0;
530
+
531
+ float tmp = 0; // partial sum for thread in warp
532
+
533
+ uint32_t aux[4];
534
+ const uint8_t * d = (const uint8_t *)aux;
535
+ const uint8_t * m = (const uint8_t *)(aux + 2);
536
+
537
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
538
+
539
+ const float * y = yy + i * QK_K + y_offset;
540
+ const uint8_t * q = x[i].qs + q_offset;
541
+
542
+ const float dall = x[i].d;
543
+ const float dmin = x[i].dmin;
544
+
545
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
546
+ aux[0] = a[0] & 0x0f0f0f0f;
547
+ aux[1] = a[1] & 0x0f0f0f0f;
548
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
549
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
550
+
551
+ float sum1 = 0, sum2 = 0;
552
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
553
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
554
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
555
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
556
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
557
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
558
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
559
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
560
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
561
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
562
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
563
+
564
+ }
565
+ tmp += dall * sum1 - dmin * sum2;
566
+
567
+ }
568
+
569
+ // sum up partial sums and write back result
570
+ __syncthreads();
571
+ #pragma unroll
572
+ for (int mask = 16; mask > 0; mask >>= 1) {
573
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
574
+ }
575
+
576
+ if (tid == 0) {
577
+ dst[row] = tmp;
578
+ }
579
+ }
580
+
581
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
582
+
583
+ const uint16_t kmask1 = 0x0303;
584
+ const uint16_t kmask2 = 0x0f0f;
585
+
586
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
587
+ if (row > nrows) return;
588
+
589
+ const int num_blocks_per_row = ncols / QK_K;
590
+ const int ib0 = row*num_blocks_per_row;
591
+
592
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
593
+
594
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
595
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
596
+
597
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
598
+ const int step = 16/K_QUANTS_PER_ITERATION;
599
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
600
+ const int in = tid - step*im; // 0....15 or 0...7
601
+
602
+ const uint8_t m = 1 << (4*im);
603
+
604
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
605
+ const int q_offset = 32*im + l0;
606
+ const int y_offset = 128*im + l0;
607
+
608
+ uint16_t utmp[4];
609
+ const int8_t * s = (const int8_t *)utmp;
610
+
611
+ const uint16_t s_shift = 4*im;
612
+
613
+ float tmp = 0; // partial sum for thread in warp
614
+
615
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
+
617
+ const float * y = yy + i * QK_K + y_offset;
618
+ const uint8_t * q = x[i].qs + q_offset;
619
+ const uint8_t * h = x[i].hmask + l0;
620
+
621
+ const uint16_t * a = (const uint16_t *)x[i].scales;
622
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
623
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
624
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
625
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
626
+
627
+ const float d = x[i].d;
628
+
629
+ float sum = 0;
630
+ for (int l = 0; l < n; ++l) {
631
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
632
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
633
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
634
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
635
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
636
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
637
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
638
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
639
+ }
640
+ tmp += d * sum;
641
+
642
+ }
643
+
644
+ // sum up partial sums and write back result
645
+ __syncthreads();
646
+ #pragma unroll
647
+ for (int mask = 16; mask > 0; mask >>= 1) {
648
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
649
+ }
650
+
651
+ if (tid == 0) {
652
+ dst[row] = tmp;
653
+ }
654
+ }
655
+
656
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
657
+
658
+ const uint16_t kmask1 = 0x3f3f;
659
+ const uint16_t kmask2 = 0x0f0f;
660
+ const uint16_t kmask3 = 0xc0c0;
661
+
662
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
663
+ if (row > nrows) return;
664
+ const int num_blocks_per_row = ncols / QK_K;
665
+ const int ib0 = row*num_blocks_per_row;
666
+
667
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
668
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
669
+
670
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
671
+
672
+ const int il = tid/step; // 0...3
673
+ const int ir = tid - step*il; // 0...7 or 0...3
674
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
675
+
676
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
677
+ const int in = il%2;
678
+
679
+ const int l0 = n*(2*ir + in);
680
+ const int q_offset = 32*im + l0;
681
+ const int y_offset = 64*im + l0;
682
+
683
+ uint16_t aux[4];
684
+ const uint8_t * sc = (const uint8_t *)aux;
685
+
686
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
687
+
688
+ float tmp = 0; // partial sum for thread in warp
612
689
 
613
- const int ip = iqs / 128; // 0 or 1
614
- const int il = (iqs - 128*ip)/8; // 0...15
615
- const int is = 8*ip;
690
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
691
 
617
- const float * y = yy + 128*ip + il;
692
+ const uint8_t * q1 = x[i].qs + q_offset;
693
+ const uint8_t * q2 = q1 + 64;
694
+ const float * y1 = yy + i*QK_K + y_offset;
695
+ const float * y2 = y1 + 128;
618
696
 
619
- const float d = x[ib].d;
697
+ const float dall = x[i].d;
698
+ const float dmin = x[i].dmin;
620
699
 
621
- const uint8_t * ql = x[ib].ql + 64*ip + il;
622
- const uint8_t * qh = x[ib].qh + 32*ip + il;
623
- const int8_t * sc = x[ib].scales + is;
700
+ const uint16_t * a = (const uint16_t *)x[i].scales;
701
+ aux[0] = a[im+0] & kmask1;
702
+ aux[1] = a[im+2] & kmask1;
703
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
704
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
624
705
 
625
- result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
626
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
627
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
628
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
629
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
630
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
631
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
632
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
706
+ float4 s = {0.f, 0.f, 0.f, 0.f};
707
+ float smin = 0;
708
+ for (int l = 0; l < n; ++l) {
709
+ s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
710
+ s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
711
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
712
+ }
713
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
714
+
715
+ }
716
+
717
+ // sum up partial sums and write back result
718
+ __syncthreads();
719
+ #pragma unroll
720
+ for (int mask = 16; mask > 0; mask >>= 1) {
721
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
722
+ }
723
+
724
+ if (tid == 0) {
725
+ dst[row] = tmp;
726
+ }
727
+ }
728
+
729
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
730
+
731
+ const uint16_t kmask1 = 0x3f3f;
732
+ const uint16_t kmask2 = 0x0f0f;
733
+ const uint16_t kmask3 = 0xc0c0;
734
+
735
+ //const int row = blockIdx.x*blockDim.y + threadIdx.y;
736
+ const int row = blockIdx.x;
737
+ const int num_blocks_per_row = ncols / QK_K;
738
+ const int ib0 = row*num_blocks_per_row;
739
+
740
+ const int tid = threadIdx.x/2; // 0...15
741
+ const int ix = threadIdx.x%2;
742
+
743
+ const int il = tid/4; // 0...3
744
+ const int ir = tid - 4*il;// 0...3
745
+ const int n = 2;
746
+
747
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
748
+ const int in = il%2;
749
+
750
+ const int l0 = n*(2*ir + in);
751
+ const int q_offset = 32*im + l0;
752
+ const int y_offset = 64*im + l0;
753
+
754
+ const uint8_t hm1 = 1 << (2*im);
755
+ const uint8_t hm2 = hm1 << 4;
756
+
757
+ uint16_t aux[4];
758
+ const uint8_t * sc = (const uint8_t *)aux;
759
+
760
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
761
+
762
+ float tmp = 0; // partial sum for thread in warp
763
+
764
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
765
+
766
+ const uint8_t * ql1 = x[i].qs + q_offset;
767
+ const uint8_t * ql2 = ql1 + 64;
768
+ const uint8_t * qh = x[i].qh + l0;
769
+ const float * y1 = yy + i*QK_K + y_offset;
770
+ const float * y2 = y1 + 128;
771
+
772
+ const float dall = x[i].d;
773
+ const float dmin = x[i].dmin;
774
+
775
+ const uint16_t * a = (const uint16_t *)x[i].scales;
776
+ aux[0] = a[im+0] & kmask1;
777
+ aux[1] = a[im+2] & kmask1;
778
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
779
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
780
+
781
+ float4 sum = {0.f, 0.f, 0.f, 0.f};
782
+ float smin = 0;
783
+ for (int l = 0; l < n; ++l) {
784
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
785
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
786
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
787
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
788
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
789
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
790
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
791
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
792
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
793
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
794
+ }
795
+ tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
796
+
797
+ }
798
+
799
+ // sum up partial sums and write back result
800
+ __syncthreads();
801
+ #pragma unroll
802
+ for (int mask = 16; mask > 0; mask >>= 1) {
803
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
804
+ }
805
+
806
+ if (tid == 0) {
807
+ dst[row] = tmp;
808
+ }
809
+ }
810
+
811
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
812
+
813
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
814
+
815
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
816
+ if (row > nrows) return;
817
+
818
+ const int num_blocks_per_row = ncols / QK_K;
819
+ const int ib0 = row*num_blocks_per_row;
820
+
821
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
822
+
823
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
824
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
825
+
826
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
827
+
828
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
829
+ const int in = tid - step*im; // 0...15 or 0...7
830
+
831
+ #if K_QUANTS_PER_ITERATION == 1
832
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
833
+ const int is = 0;
834
+ #else
835
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
836
+ const int is = in / 4;
837
+ #endif
838
+ const int ql_offset = 64*im + l0;
839
+ const int qh_offset = 32*im + l0;
840
+ const int s_offset = 8*im + is;
841
+ const int y_offset = 128*im + l0;
842
+
843
+ float tmp = 0; // partial sum for thread in warp
844
+
845
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
846
+
847
+ const float * y = yy + i * QK_K + y_offset;
848
+ const uint8_t * ql = x[i].ql + ql_offset;
849
+ const uint8_t * qh = x[i].qh + qh_offset;
850
+ const int8_t * s = x[i].scales + s_offset;
851
+
852
+ const float d = x[i].d;
853
+
854
+ #if K_QUANTS_PER_ITERATION == 1
855
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
856
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
857
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
858
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
859
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
860
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
861
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
862
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
863
+ tmp += sum;
864
+ #else
865
+ float sum = 0;
866
+ for (int l = 0; l < 4; ++l) {
867
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
868
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
869
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
870
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
871
+ }
872
+ tmp += sum;
873
+ #endif
874
+
875
+ }
876
+
877
+ // sum up partial sums and write back result
878
+ __syncthreads();
879
+ #pragma unroll
880
+ for (int mask = 16; mask > 0; mask >>= 1) {
881
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
882
+ }
633
883
 
884
+ if (tid == 0) {
885
+ dst[row] = tmp;
886
+ }
634
887
  }
635
888
 
636
- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
889
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
637
890
  const half * x = (const half *) vx;
638
891
 
639
- v0 = __half2float(x[ib + iqs + 0]);
640
- v1 = __half2float(x[ib + iqs + 1]);
892
+ // automatic half -> float type cast if dfloat == float
893
+ v.x = x[ib + iqs + 0];
894
+ v.y = x[ib + iqs + 1];
641
895
  }
642
896
 
643
897
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
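
The convert_f16 change at the end of this hunk relies on the implicit __half to float conversion provided by cuda_fp16.h: with dfloat == float the assignments convert automatically, and with dfloat == half they are plain copies, so the explicit __half2float calls are no longer needed. A minimal sketch of the float case (illustrative only):

    #include <cuda_fp16.h>

    __device__ void load_f16_pair(const half * x, int i, float & a, float & b) {
        a = x[i + 0]; // implicit __half -> float conversion
        b = x[i + 1];
    }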
@@ -654,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
654
908
  const int y_offset = qr == 1 ? 1 : qk/2;
655
909
 
656
910
  // dequantize
657
- float & v0 = y[iybs + iqs + 0];
658
- float & v1 = y[iybs + iqs + y_offset];
659
- dequantize_kernel(vx, ib, iqs, v0, v1);
911
+ dfloat2 v;
912
+ dequantize_kernel(vx, ib, iqs, v);
913
+
914
+ y[iybs + iqs + 0] = v.x;
915
+ y[iybs + iqs + y_offset] = v.y;
660
916
  }
661
917
 
662
918
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
663
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
919
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
664
920
  // qk = quantized weights per x block
665
921
  // qr = number of quantized weights per data value in x block
666
922
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -675,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
675
931
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
676
932
  const int y_offset = qr == 1 ? 1 : qk/2;
677
933
 
678
- float tmp = 0.0f; // partial sum for thread in warp
934
+ // partial sum for each thread
935
+ #ifdef GGML_CUDA_DMMV_F16
936
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
937
+ #else
938
+ float tmp = 0.0f;
939
+ #endif // GGML_CUDA_DMMV_F16
679
940
 
680
941
  for (int i = 0; i < ncols; i += iter_stride) {
681
942
  const int col = i + vals_per_iter*tid;
@@ -689,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
689
950
  // process 2 vals per j iter
690
951
 
691
952
  // dequantize
692
- float v0, v1;
693
- dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
694
953
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
954
+ dfloat2 v;
955
+ dequantize_kernel(vx, ib, iqs + j/qr, v);
695
956
 
696
957
  // matrix multiplication
697
- tmp += v0 * y[iybs + iqs + j/qr + 0];
698
- tmp += v1 * y[iybs + iqs + j/qr + y_offset];
699
958
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
959
+ #ifdef GGML_CUDA_DMMV_F16
960
+ tmp += __hmul2(v, {
961
+ y[iybs + iqs + j/qr + 0],
962
+ y[iybs + iqs + j/qr + y_offset]
963
+ });
964
+ #else
965
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
966
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
967
+ #endif // GGML_CUDA_DMMV_F16
700
968
  }
701
969
  }
702
970
 
@@ -708,47 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
708
976
  }
709
977
 
710
978
  if (tid == 0) {
979
+ #ifdef GGML_CUDA_DMMV_F16
980
+ dst[row] = tmp.x + tmp.y;
981
+ #else
711
982
  dst[row] = tmp;
712
- }
713
- }
714
-
715
- template <int n_thread, dot_kernel_k_t dot_kernel>
716
- static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
717
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
718
-
719
- if (row >= nrows) {
720
- return;
721
- }
722
-
723
- const int tid = threadIdx.x;
724
-
725
- const int iter_stride = QK_K;
726
- const int vals_per_iter = iter_stride / n_thread;
727
- const int num_blocks_per_row = ncols / QK_K;
728
- const int ib0 = row*num_blocks_per_row;
729
-
730
- float tmp = 0; // partial sum for thread in warp
731
-
732
- for (int i = 0; i < ncols; i += iter_stride) {
733
- const int col = i + vals_per_iter*tid;
734
- const int ib = ib0 + col/QK_K; // x block index
735
- const int iqs = col%QK_K; // x quant index
736
- const int iybs = col - col%QK_K; // y block start index
737
-
738
- float v;
739
- dot_kernel(vx, ib, iqs, y + iybs, v);
740
- tmp += v;
741
- }
742
-
743
- // sum up partial sums and write back result
744
- __syncthreads();
745
- #pragma unroll
746
- for (int mask = 16; mask > 0; mask >>= 1) {
747
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
748
- }
749
-
750
- if (tid == 0) {
751
- dst[row] = tmp;
983
+ #endif // GGML_CUDA_DMMV_F16
752
984
  }
753
985
  }
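
With GGML_CUDA_DMMV_F16 defined, the generic dequantize_mul_mat_vec above keeps two running sums per thread in a single half2 register, multiplies pairs with __hmul2 as it accumulates, and only collapses them to a float when the row result is written (dst[row] = tmp.x + tmp.y). A standalone sketch of that final step (assumed helper name, not from the package):

    #include <cuda_fp16.h>

    __device__ float finalize_half2_sum(half2 tmp) {
        const float2 f = __half22float2(tmp); // both halves widened to float
        return f.x + f.y;                     // same as tmp.x + tmp.y in the kernel
    }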
754
986
 
@@ -1043,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1043
1275
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1044
1276
  }
1045
1277
 
1046
- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1278
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1047
1279
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1048
1280
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1049
1281
  const dim3 block_nums(1, block_num_y, 1);
@@ -1052,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
1052
1284
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1053
1285
  }
1054
1286
 
1055
- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1287
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1056
1288
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1057
1289
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1058
1290
  const dim3 block_nums(1, block_num_y, 1);
@@ -1061,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
1061
1293
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1062
1294
  }
1063
1295
 
1064
- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1296
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1065
1297
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1066
1298
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1067
1299
  const dim3 block_nums(1, block_num_y, 1);
@@ -1070,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
1070
1302
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1071
1303
  }
1072
1304
 
1073
- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1305
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1074
1306
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1075
1307
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1076
1308
  const dim3 block_nums(1, block_num_y, 1);
@@ -1079,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
1079
1311
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1080
1312
  }
1081
1313
 
1082
- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1314
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1083
1315
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1084
1316
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1085
1317
  const dim3 block_nums(1, block_num_y, 1);
@@ -1090,47 +1322,44 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
1090
1322
 
1091
1323
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1092
1324
  GGML_ASSERT(ncols % QK_K == 0);
1093
- const int ny = 2;
1325
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1094
1326
  const int block_num_y = (nrows + ny - 1) / ny;
1095
1327
  const dim3 block_nums(1, block_num_y, 1);
1096
1328
  const dim3 block_dims(32, ny, 1);
1097
- dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1329
+ dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1098
1330
  }
1099
1331
 
1100
1332
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1101
1333
  GGML_ASSERT(ncols % QK_K == 0);
1102
- const int ny = 2;
1334
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1103
1335
  const int block_num_y = (nrows + ny - 1) / ny;
1104
1336
  const dim3 block_nums(1, block_num_y, 1);
1105
1337
  const dim3 block_dims(32, ny, 1);
1106
- dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1338
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1107
1339
  }
1108
1340
 
1109
1341
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1110
1342
  GGML_ASSERT(ncols % QK_K == 0);
1111
- const int ny = 2;
1343
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1112
1344
  const int block_num_y = (nrows + ny - 1) / ny;
1113
1345
  const dim3 block_nums(1, block_num_y, 1);
1114
1346
  const dim3 block_dims(32, ny, 1);
1115
- dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1347
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1116
1348
  }
1117
1349
 
1118
1350
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1119
1351
  GGML_ASSERT(ncols % QK_K == 0);
1120
- const int ny = 2;
1121
- const int block_num_y = (nrows + ny - 1) / ny;
1122
- const dim3 block_nums(1, block_num_y, 1);
1123
- const dim3 block_dims(32, ny, 1);
1124
- dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1352
+ const dim3 block_dims(32, 1, 1);
1353
+ dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
1125
1354
  }
1126
1355
 
1127
1356
  static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1128
1357
  GGML_ASSERT(ncols % QK_K == 0);
1129
- const int ny = 2;
1358
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1130
1359
  const int block_num_y = (nrows + ny - 1) / ny;
1131
1360
  const dim3 block_nums(1, block_num_y, 1);
1132
1361
  const dim3 block_dims(32, ny, 1);
1133
- dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1362
+ dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1134
1363
  }
1135
1364
 
1136
1365
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
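
The launch shapes for the k-quant mul-mat-vec launchers above now depend on K_QUANTS_PER_ITERATION: q2_K keeps two rows per block, q3_K/q4_K/q6_K use ny = 2 / K_QUANTS_PER_ITERATION rows per block, and q5_K launches one 32-thread block per row. A hypothetical helper mirroring the shared launch math (illustrative, not from the package):

    #include <cuda_runtime.h>

    static void kquant_launch_shape(int nrows, int k_quants_per_iteration,
                                    dim3 & block_nums, dim3 & block_dims) {
        const int ny          = 2 / k_quants_per_iteration;  // 2 or 1 rows per block
        const int block_num_y = (nrows + ny - 1) / ny;       // blocks needed to cover all rows
        block_nums = dim3(1, block_num_y, 1);
        block_dims = dim3(32, ny, 1);                        // one warp in x per row
    }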
@@ -1138,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1138
1367
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1139
1368
  }
1140
1369
 
1141
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1370
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1142
1371
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1143
1372
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1144
1373
  const dim3 block_nums(1, block_num_y, 1);
@@ -1306,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
1306
1535
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1307
1536
  static size_t g_scratch_offset = 0;
1308
1537
 
1309
- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1310
- #define GGML_CUDA_MAX_EVENTS 64
1311
-
1312
1538
  static int g_device_count = -1;
1313
1539
  static int g_main_device = 0;
1314
1540
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1315
1541
 
1316
1542
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1317
1543
 
1318
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1319
-
1320
- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1321
- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1544
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
1322
1545
 
1323
1546
  void ggml_init_cublas() {
1324
1547
  static bool initialized = false;
@@ -1342,15 +1565,8 @@ void ggml_init_cublas() {
1342
1565
  for (int id = 0; id < g_device_count; ++id) {
1343
1566
  CUDA_CHECK(cudaSetDevice(id));
1344
1567
 
1345
- // create streams
1346
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1347
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1348
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1349
- }
1350
- // create events
1351
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1352
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1353
- }
1568
+ // create main stream
1569
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
1354
1570
 
1355
1571
  // create cublas handle
1356
1572
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
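
With the per-device arrays of streams and events collapsed to a single main stream per device, all work for a device is issued into one totally ordered queue: a copy enqueued on the stream is guaranteed to complete before a later kernel on the same stream reads the data, which is why the separate src1 memcpy streams and the cudaEventRecord/cudaStreamWaitEvent pairs are dropped further down in this diff. A standalone sketch of that ordering argument (illustrative names, not from the package):

    #include <cuda_runtime.h>

    __global__ void scale_kernel(float * x, int n) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < n) {
            x[i] *= 2.0f;
        }
    }

    // Copy and kernel share one stream, so the kernel cannot start before the
    // copy has finished - no event synchronization is required.
    void copy_then_scale(float * dev, const float * host, int n, cudaStream_t stream) {
        cudaMemcpyAsync(dev, host, n*sizeof(float), cudaMemcpyHostToDevice, stream);
        scale_kernel<<<(n + 255)/256, 256, 0, stream>>>(dev, n);
    }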
@@ -1566,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1566
1782
  const int64_t ne00 = src0->ne[0];
1567
1783
  const int64_t nrows = i01_high - i01_low;
1568
1784
 
1785
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1786
+ #ifdef GGML_CUDA_DMMV_F16
1787
+ size_t ash;
1788
+ dfloat * src1_dfloat = nullptr; // dfloat == half
1789
+
1790
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1791
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1792
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1793
+
1794
+ if (src1_convert_f16) {
1795
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
1796
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
1797
+ ne00, 1, sizeof(float), 0, 0,
1798
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
1799
+ }
1800
+ #else
1801
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
1802
+ #endif // GGML_CUDA_DMMV_F16
1803
+
1569
1804
  switch (src0->type) {
1570
1805
  case GGML_TYPE_Q4_0:
1571
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1806
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1572
1807
  break;
1573
1808
  case GGML_TYPE_Q4_1:
1574
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1809
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1575
1810
  break;
1576
1811
  case GGML_TYPE_Q5_0:
1577
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1812
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1578
1813
  break;
1579
1814
  case GGML_TYPE_Q5_1:
1580
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1815
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1581
1816
  break;
1582
1817
  case GGML_TYPE_Q8_0:
1583
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1818
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1584
1819
  break;
1585
1820
  case GGML_TYPE_Q2_K:
1586
1821
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -1598,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1598
1833
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1599
1834
  break;
1600
1835
  case GGML_TYPE_F16:
1601
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1836
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1602
1837
  break;
1603
1838
  default:
1604
1839
  GGML_ASSERT(false);
@@ -1606,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1606
1841
  }
1607
1842
  CUDA_CHECK(cudaGetLastError());
1608
1843
 
1844
+ #ifdef GGML_CUDA_DMMV_F16
1845
+ if (src1_convert_f16) {
1846
+ ggml_cuda_pool_free(src1_dfloat, ash);
1847
+ }
1848
+ #endif // GGML_CUDA_DMMV_F16
1849
+
1609
1850
  (void) src1;
1610
1851
  (void) dst;
1611
1852
  (void) src0_ddf_i;
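
The GGML_CUDA_DMMV_F16 path above converts the f32 activations (src1) to f16 once per call, into a buffer taken from the CUDA memory pool, runs the mul-mat-vec kernel on the half data and frees the buffer afterwards; this applies only to the non-k-quant and F16 source types listed in src1_convert_f16. A minimal stand-in for that conversion step (sketch only; the package uses ggml_cpy_f32_f16_cuda for this):

    #include <cuda_fp16.h>

    __global__ void f32_to_f16(const float * x, half * y, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < k) {
            y[i] = __float2half(x[i]);
        }
    }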
@@ -1817,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1817
2058
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1818
2059
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1819
2060
 
2061
+ // if multiple GPUs are used they need to wait for the main GPU to finish
2062
+ if (split && g_device_count > 1) {
2063
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2064
+ CUDA_CHECK(cudaDeviceSynchronize());
2065
+ }
2066
+
1820
2067
  for (int id = 0; id < g_device_count; ++id) {
1821
2068
  if (!split && id != g_main_device) {
1822
2069
  continue;
@@ -1915,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1915
2162
  }
1916
2163
  const int64_t i11 = i13*ne12 + i12;
1917
2164
 
1918
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1919
- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1920
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2165
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
1921
2166
 
1922
2167
  // for split tensors the data begins at i0 == i0_offset_low
1923
2168
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -1945,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1945
2190
  if (src1->backend == GGML_BACKEND_CPU) {
1946
2191
  GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
1947
2192
  int64_t nrows1 = flatten_rows ? nrows0 : ne11;
1948
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
2193
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
1949
2194
  } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
1950
2195
  if (id != g_main_device) {
1951
2196
  GGML_ASSERT(!flatten_rows);
1952
2197
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1953
2198
  src1_ddf_i_source += i11*src1_stride;
1954
2199
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1955
- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2200
+ cudaMemcpyDeviceToDevice, cudaStream_main));
1956
2201
  }
1957
2202
  } else if (src1_on_device && !src1_is_contiguous) {
1958
2203
  GGML_ASSERT(!split);
@@ -1961,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1961
2206
  GGML_ASSERT(false);
1962
2207
  }
1963
2208
  }
1964
- CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1965
2209
 
1966
2210
  if (!src0_on_device || !src0_is_contiguous) {
1967
2211
  if (src0_is_f32) {
@@ -1977,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1977
2221
  CUDA_CHECK(cudaGetLastError());
1978
2222
  }
1979
2223
 
1980
- // wait with main stream until src1 memcpy is done
1981
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
1982
-
1983
2224
  // do the computation
1984
2225
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
1985
2226
 
@@ -2017,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2017
2258
 
2018
2259
  // wait until each device is finished, then free their buffers
2019
2260
  for (int id = 0; id < g_device_count; ++id) {
2261
+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
2262
+ continue;
2263
+ }
2264
+
2020
2265
  CUDA_CHECK(cudaSetDevice(id));
2021
2266
  CUDA_CHECK(cudaDeviceSynchronize());
2267
+
2022
2268
  if (src0_asq[id] > 0) {
2023
2269
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
2024
2270
  }
@@ -2084,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2084
2330
  const int64_t ne02 = src0->ne[2];
2085
2331
 
2086
2332
  CUDA_CHECK(cudaSetDevice(g_main_device));
2087
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2333
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2088
2334
 
2089
2335
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2090
2336
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2096,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2096
2342
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2097
2343
 
2098
2344
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2099
-
2100
- CUDA_CHECK(cudaDeviceSynchronize());
2101
2345
  }
2102
2346
 
2103
2347
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2115,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2115
2359
  const int64_t nb02 = src0->nb[2];
2116
2360
 
2117
2361
  CUDA_CHECK(cudaSetDevice(g_main_device));
2118
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2362
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2119
2363
 
2120
2364
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2121
2365
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2130,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2130
2374
  const int channel_stride_x = nb02 / sizeof(half);
2131
2375
 
2132
2376
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2133
-
2134
- CUDA_CHECK(cudaDeviceSynchronize());
2135
2377
  }
2136
2378
 
2137
2379
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2187,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2187
2429
  const int64_t nb12 = src1->nb[2];
2188
2430
 
2189
2431
  CUDA_CHECK(cudaSetDevice(g_main_device));
2190
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2432
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2191
2433
 
2192
2434
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2193
2435
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2205,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2205
2447
  GGML_ASSERT(false);
2206
2448
  }
2207
2449
 
2208
- CUDA_CHECK(cudaDeviceSynchronize());
2209
-
2210
2450
  (void) dst;
2211
2451
  }
2212
2452