llama_cpp 0.2.0 → 0.2.2

@@ -1,5 +1,6 @@
  #include <cstddef>
  #include <cstdint>
+ #include <limits>
  #include <stdint.h>
  #include <stdio.h>
  #include <atomic>
@@ -12,6 +13,10 @@
  #include "ggml-cuda.h"
  #include "ggml.h"

+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

  #define CUDA_CHECK(err) \
@@ -24,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } \
  } while (0)

- #if CUDART_VERSION >= 12
+ #if CUDART_VERSION >= 12000
  #define CUBLAS_CHECK(err) \
  do { \
  cublasStatus_t err_ = (err); \
@@ -45,9 +50,18 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } while (0)
  #endif // CUDART_VERSION >= 11

- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
+ #ifdef GGML_CUDA_DMMV_F16
+ typedef half dfloat; // dequantize float
+ typedef half2 dfloat2;
+ #else
+ typedef float dfloat; // dequantize float
+ typedef float2 dfloat2;
+ #endif //GGML_CUDA_DMMV_F16
+
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
  typedef void (*ggml_cuda_op_t)(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
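Editor's note: the dfloat/dfloat2 typedefs introduced above let the same dequantize kernels compute in either fp16 or fp32, selected by GGML_CUDA_DMMV_F16. A minimal sketch of the pattern (illustrative only, not part of the diff; scale_pair is a made-up name):

    #include <cuda_fp16.h>

    #ifdef GGML_CUDA_DMMV_F16
    typedef half  dfloat;  // dequantize float
    typedef half2 dfloat2;
    #else
    typedef float  dfloat;
    typedef float2 dfloat2;
    #endif

    // Subtract the q4_0 zero point (8) and apply the block scale d, using packed
    // half2 intrinsics on the fp16 path and plain float math otherwise.
    static __device__ __forceinline__ void scale_pair(dfloat2 & v, const dfloat d) {
    #ifdef GGML_CUDA_DMMV_F16
        v = __hsub2(v, {8.0f, 8.0f});
        v = __hmul2(v, {d, d});
    #else
        v.x = (v.x - 8.0f) * d;
        v.y = (v.y - 8.0f) * d;
    #endif
    }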
@@ -151,7 +165,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
  #define CUDA_SILU_BLOCK_SIZE 256
+ #define CUDA_CPY_BLOCK_SIZE 32
+ #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

  // dmmv = dequantize_mul_mat_vec
@@ -162,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define GGML_CUDA_DMMV_Y 1
  #endif

+ #ifndef K_QUANTS_PER_ITERATION
+ #define K_QUANTS_PER_ITERATION 2
+ #else
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+ #endif
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -219,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
  }
  }

- static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_0 * x = (const block_q4_0 *) vx;

- const float d = x[ib].d;
+ const dfloat d = x[ib].d;

- const uint8_t vui = x[ib].qs[iqs];
+ const int vui = x[ib].qs[iqs];

- const int8_t vi0 = vui & 0xF;
- const int8_t vi1 = vui >> 4;
+ v.x = vui & 0xF;
+ v.y = vui >> 4;

- v0 = (vi0 - 8)*d;
- v1 = (vi1 - 8)*d;
+ #ifdef GGML_CUDA_DMMV_F16
+ v = __hsub2(v, {8.0f, 8.0f});
+ v = __hmul2(v, {d, d});
+ #else
+ v.x = (v.x - 8.0f) * d;
+ v.y = (v.y - 8.0f) * d;
+ #endif // GGML_CUDA_DMMV_F16
  }

- static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

- const float d = x[ib].d;
- const float m = x[ib].m;
+ const dfloat d = x[ib].d;
+ const dfloat m = x[ib].m;

- const uint8_t vui = x[ib].qs[iqs];
+ const int vui = x[ib].qs[iqs];

- const int8_t vi0 = vui & 0xF;
- const int8_t vi1 = vui >> 4;
+ v.x = vui & 0xF;
+ v.y = vui >> 4;

- v0 = vi0*d + m;
- v1 = vi1*d + m;
+ #ifdef GGML_CUDA_DMMV_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+ #else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+ #endif // GGML_CUDA_DMMV_F16
  }

- static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_0 * x = (const block_q5_0 *) vx;

- const float d = x[ib].d;
+ const dfloat d = x[ib].d;

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));

- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;

- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

- v0 = x0*d;
- v1 = x1*d;
+ #ifdef GGML_CUDA_DMMV_F16
+ v = __hsub2(v, {16.0f, 16.0f});
+ v = __hmul2(v, {d, d});
+ #else
+ v.x = (v.x - 16.0f) * d;
+ v.y = (v.y - 16.0f) * d;
+ #endif // GGML_CUDA_DMMV_F16
  }

- static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

- const float d = x[ib].d;
- const float m = x[ib].m;
+ const dfloat d = x[ib].d;
+ const dfloat m = x[ib].m;

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));

- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;

- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

- v0 = x0*d + m;
- v1 = x1*d + m;
+ #ifdef GGML_CUDA_DMMV_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+ #else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+ #endif // GGML_CUDA_DMMV_F16
  }

- static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q8_0 * x = (const block_q8_0 *) vx;

- const float d = x[ib].d;
+ const dfloat d = x[ib].d;

- const int8_t vi0 = x[ib].qs[iqs + 0];
- const int8_t vi1 = x[ib].qs[iqs + 1];
+ v.x = x[ib].qs[iqs + 0];
+ v.y = x[ib].qs[iqs + 1];

- v0 = vi0*d;
- v1 = vi1*d;
+ #ifdef GGML_CUDA_DMMV_F16
+ v = __hmul2(v, {d, d});
+ #else
+ v.x *= d;
+ v.y *= d;
+ #endif // GGML_CUDA_DMMV_F16
  }

  //================================== k-quants
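Editor's note: the q5_0/q5_1 dequantizers above rebuild each 5-bit weight from a 4-bit nibble in qs plus one high bit packed into the 32-bit qh field. A host-side sketch of that bit manipulation (hypothetical helper, not part of the diff):

    #include <cstdint>

    // Recover the two 5-bit q5_0 quants that share byte qs[iqs]; qh packs one
    // extra high bit per weight (bit iqs for the first, bit iqs+12 for the second).
    static void unpack_q5_0_pair(const uint8_t * qs, uint32_t qh, int iqs, int & q0, int & q1) {
        const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;  // high bit of the low-nibble weight
        const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10; // high bit of the high-nibble weight
        q0 = ((qs[iqs] & 0xf) | xh_0) - 16;                // recentre to [-16, 15]
        q1 = ((qs[iqs] >> 4)  | xh_1) - 16;
    }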
@@ -321,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

  }

- static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
- const block_q2_K * x = (const block_q2_K *) vx;
-
- // if n is 0, we want to do the lower 128, else the upper 128,
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
- // y[l+16], y[l+48], y[l+80], y[l+112]
- int n = iqs/128; // 0 or 1
- int r = iqs - 128*n; // 0...120 in steps of 8
- int l = r/8; // 0...15 in steps of 1
-
- const float * y = yy + 128*n + l;
- const uint8_t * q = x[ib].qs + 32*n + l;
- const uint8_t * s = x[ib].scales + 8*n;
-
- const float dall = x[ib].d;
- const float dmin = x[ib].dmin;
-
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
-
- result = sum;
-
- }
-
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

  int r = threadIdx.x/4;
@@ -383,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

  }

- static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
- const block_q3_K * x = (const block_q3_K *) vx;
-
- const uint32_t kmask1 = 0x03030303;
- const uint32_t kmask2 = 0x0f0f0f0f;
-
- uint32_t aux[3];
- uint32_t utmp[4];
-
- // if n is 0, we want to do the lower 128, else the upper 128,
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
- // y[l+16], y[l+48], y[l+80], y[l+112]
- int n = iqs/128; // 0 or 1
- int r = iqs - 128*n; // 0...120 in steps of 8
- int l = r/8; // 0...15 in steps of 1
-
- const float * y = yy + 128*n + l;
- const uint8_t * q = x[ib].qs + 32*n + l;
- const uint8_t * hm = x[ib].hmask + l;
- const int8_t * s = (const int8_t *)utmp + 8*n;
-
- memcpy(aux, x[ib].scales, 12);
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
- const float dall = x[ib].d;
-
- const uint8_t m = 1 << (4*n);
-
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
-
- result = sum * dall;
-
- }
-
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  if (j < 4) {
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -474,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
  }
  }

- static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
- const block_q4_K * x = (const block_q4_K *) vx;
-
- // iqs is in 0...248 in steps of 8 =>
- const int j = iqs / 64; // j is in 0...3
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
- const int is = 2*j; // is is in 0...6 in steps of 2
-
- const float * y = yy + 64*j + ir;
- const uint8_t * q = x[ib].qs + 32*j + ir;
-
- const float dall = x[ib].d;
- const float dmin = x[ib].dmin;
-
- uint8_t sc, m;
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
- const float d1 = dall * sc;
- const float m1 = dmin * m;
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
- const float d2 = dall * sc;
- const float m2 = dmin * m;
-
- float sum = 0;
- for (int k = 0; k < 4; ++k) {
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
- }
- result = sum;
-
- }
-
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
  const block_q5_K * x = (const block_q5_K *) vx;

@@ -539,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
  }

- static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
- const block_q5_K * x = (const block_q5_K *) vx;
-
- // iqs is in 0...248 in steps of 8 =>
- const int j = iqs / 64; // j is in 0...3
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
- const int is = 2*j; // is is in 0...6 in steps of 2
-
- const float * y = yy + 64*j + ir;
- const uint8_t * ql = x[ib].qs + 32*j + ir;
- const uint8_t * qh = x[ib].qh + ir;
-
- const float dall = x[ib].d;
- const float dmin = x[ib].dmin;
-
- uint8_t sc, m;
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
- const float d1 = dall * sc;
- const float m1 = dmin * m;
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
- const float d2 = dall * sc;
- const float m2 = dmin * m;
-
- uint8_t hm = 1 << is;
- float sum = 0;
- for (int k = 0; k < 4; ++k) {
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
- }
- hm <<= 1;
- for (int k = 0; k < 4; ++k) {
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
- }
- result = sum;
-
- }
-
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
  const block_q6_K * x = (const block_q6_K *) vx;

@@ -601,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
  }

- static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

- const block_q6_K * x = (const block_q6_K *) vx;
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ if (row > nrows) return;
+
+ const int num_blocks_per_row = ncols / QK_K;
+ const int ib0 = row*num_blocks_per_row;
+
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+ const int step = 16/K_QUANTS_PER_ITERATION;
+
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
+ const int in = tid - step*im; // 0...15 or 0...7
+
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
+ const int q_offset = 32*im + l0;
+ const int s_offset = 8*im;
+ const int y_offset = 128*im + l0;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ uint32_t aux[4];
+ const uint8_t * d = (const uint8_t *)aux;
+ const uint8_t * m = (const uint8_t *)(aux + 2);
+
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + y_offset;
+ const uint8_t * q = x[i].qs + q_offset;
+
+ const float dall = x[i].d;
+ const float dmin = x[i].dmin;
+
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+ aux[0] = a[0] & 0x0f0f0f0f;
+ aux[1] = a[1] & 0x0f0f0f0f;
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+ float sum1 = 0, sum2 = 0;
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+ }
+ tmp += dall * sum1 - dmin * sum2;
+
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (tid == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+ const uint16_t kmask1 = 0x0303;
+ const uint16_t kmask2 = 0x0f0f;
+
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ if (row > nrows) return;
+
+ const int num_blocks_per_row = ncols / QK_K;
+ const int ib0 = row*num_blocks_per_row;
+
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
+ const int step = 16/K_QUANTS_PER_ITERATION;
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
+ const int in = tid - step*im; // 0....15 or 0...7
+
+ const uint8_t m = 1 << (4*im);
+
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
+ const int q_offset = 32*im + l0;
+ const int y_offset = 128*im + l0;
+
+ uint16_t utmp[4];
+ const int8_t * s = (const int8_t *)utmp;
+
+ const uint16_t s_shift = 4*im;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + y_offset;
+ const uint8_t * q = x[i].qs + q_offset;
+ const uint8_t * h = x[i].hmask + l0;
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+ const float d = x[i].d;
+
+ float sum = 0;
+ for (int l = 0; l < n; ++l) {
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+ }
+ tmp += d * sum;
+
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (tid == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ if (row > nrows) return;
+ const int num_blocks_per_row = ncols / QK_K;
+ const int ib0 = row*num_blocks_per_row;
+
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
+
+ const int il = tid/step; // 0...3
+ const int ir = tid - step*il; // 0...7 or 0...3
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
+
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+ const int in = il%2;
+
+ const int l0 = n*(2*ir + in);
+ const int q_offset = 32*im + l0;
+ const int y_offset = 64*im + l0;
+
+ uint16_t aux[4];
+ const uint8_t * sc = (const uint8_t *)aux;
+
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+ const uint8_t * q1 = x[i].qs + q_offset;
+ const uint8_t * q2 = q1 + 64;
+ const float * y1 = yy + i*QK_K + y_offset;
+ const float * y2 = y1 + 128;
+
+ const float dall = x[i].d;
+ const float dmin = x[i].dmin;
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux[0] = a[im+0] & kmask1;
+ aux[1] = a[im+2] & kmask1;
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+ float4 s = {0.f, 0.f, 0.f, 0.f};
+ float smin = 0;
+ for (int l = 0; l < n; ++l) {
+ s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
+ s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+ }
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (tid == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
+ //const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int row = blockIdx.x;
+ const int num_blocks_per_row = ncols / QK_K;
+ const int ib0 = row*num_blocks_per_row;
+
+ const int tid = threadIdx.x/2; // 0...15
+ const int ix = threadIdx.x%2;
+
+ const int il = tid/4; // 0...3
+ const int ir = tid - 4*il;// 0...3
+ const int n = 2;

- const int ip = iqs / 128; // 0 or 1
- const int il = (iqs - 128*ip)/8; // 0...15
- const int is = 8*ip;
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+ const int in = il%2;

- const float * y = yy + 128*ip + il;
+ const int l0 = n*(2*ir + in);
+ const int q_offset = 32*im + l0;
+ const int y_offset = 64*im + l0;

- const float d = x[ib].d;
+ const uint8_t hm1 = 1 << (2*im);
+ const uint8_t hm2 = hm1 << 4;

- const uint8_t * ql = x[ib].ql + 64*ip + il;
- const uint8_t * qh = x[ib].qh + 32*ip + il;
- const int8_t * sc = x[ib].scales + is;
+ uint16_t aux[4];
+ const uint8_t * sc = (const uint8_t *)aux;

- result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;

+ float tmp = 0; // partial sum for thread in warp
+
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+ const uint8_t * ql1 = x[i].qs + q_offset;
+ const uint8_t * ql2 = ql1 + 64;
+ const uint8_t * qh = x[i].qh + l0;
+ const float * y1 = yy + i*QK_K + y_offset;
+ const float * y2 = y1 + 128;
+
+ const float dall = x[i].d;
+ const float dmin = x[i].dmin;
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux[0] = a[im+0] & kmask1;
+ aux[1] = a[im+2] & kmask1;
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+ float4 sum = {0.f, 0.f, 0.f, 0.f};
+ float smin = 0;
+ for (int l = 0; l < n; ++l) {
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+ }
+ tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (tid == 0) {
+ dst[row] = tmp;
+ }
  }

- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+ if (row > nrows) return;
+
+ const int num_blocks_per_row = ncols / QK_K;
+ const int ib0 = row*num_blocks_per_row;
+
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
+
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
+
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
+ const int in = tid - step*im; // 0...15 or 0...7
+
+ #if K_QUANTS_PER_ITERATION == 1
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
+ const int is = 0;
+ #else
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
+ const int is = in / 4;
+ #endif
+ const int ql_offset = 64*im + l0;
+ const int qh_offset = 32*im + l0;
+ const int s_offset = 8*im + is;
+ const int y_offset = 128*im + l0;
+
+ float tmp = 0; // partial sum for thread in warp
+
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+ const float * y = yy + i * QK_K + y_offset;
+ const uint8_t * ql = x[i].ql + ql_offset;
+ const uint8_t * qh = x[i].qh + qh_offset;
+ const int8_t * s = x[i].scales + s_offset;
+
+ const float d = x[i].d;
+
+ #if K_QUANTS_PER_ITERATION == 1
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+ tmp += sum;
+ #else
+ float sum = 0;
+ for (int l = 0; l < 4; ++l) {
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+ }
+ tmp += sum;
+ #endif
+
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (tid == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const half * x = (const half *) vx;

- v0 = __half2float(x[ib + iqs + 0]);
- v1 = __half2float(x[ib + iqs + 1]);
+ // automatic half -> float type cast if dfloat == float
+ v.x = x[ib + iqs + 0];
+ v.y = x[ib + iqs + 1];
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
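Editor's note: every new dmmv kernel above ends with the same warp-level butterfly reduction. A standalone sketch of that reduction (illustrative kernel name, assumes 32-thread warps; not part of the diff):

    // Each of the 32 lanes holds a partial sum; XOR shuffles fold them together
    // in log2(32) = 5 steps so that lane 0 ends up with the total for its row.
    static __global__ void warp_reduce_rows(const float * partial, float * dst) {
        float tmp = partial[blockIdx.x*32 + threadIdx.x];

    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
        }

        if (threadIdx.x == 0) {
            dst[blockIdx.x] = tmp; // one result per block/row
        }
    }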
@@ -649,23 +908,35 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
  const int y_offset = qr == 1 ? 1 : qk/2;

  // dequantize
- float & v0 = y[iybs + iqs + 0];
- float & v1 = y[iybs + iqs + y_offset];
- dequantize_kernel(vx, ib, iqs, v0, v1);
+ dfloat2 v;
+ dequantize_kernel(vx, ib, iqs, v);
+
+ y[iybs + iqs + 0] = v.x;
+ y[iybs + iqs + y_offset] = v.y;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
  // qk = quantized weights per x block
  // qr = number of quantized weights per data value in x block
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+ if (row >= nrows) {
+ return;
+ }
+
  const int tid = threadIdx.x;

  const int iter_stride = 2*GGML_CUDA_DMMV_X;
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
  const int y_offset = qr == 1 ? 1 : qk/2;

- float tmp = 0.0f; // partial sum for thread in warp
+ // partial sum for each thread
+ #ifdef GGML_CUDA_DMMV_F16
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+ #else
+ float tmp = 0.0f;
+ #endif // GGML_CUDA_DMMV_F16

  for (int i = 0; i < ncols; i += iter_stride) {
  const int col = i + vals_per_iter*tid;
@@ -679,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
  // process 2 vals per j iter

  // dequantize
- float v0, v1;
- dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+ dfloat2 v;
+ dequantize_kernel(vx, ib, iqs + j/qr, v);

  // matrix multiplication
- tmp += v0 * y[iybs + iqs + j/qr + 0];
- tmp += v1 * y[iybs + iqs + j/qr + y_offset];
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+ #ifdef GGML_CUDA_DMMV_F16
+ tmp += __hmul2(v, {
+ y[iybs + iqs + j/qr + 0],
+ y[iybs + iqs + j/qr + y_offset]
+ });
+ #else
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+ #endif // GGML_CUDA_DMMV_F16
  }
  }

@@ -698,64 +976,232 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
  }

  if (tid == 0) {
+ #ifdef GGML_CUDA_DMMV_F16
+ dst[row] = tmp.x + tmp.y;
+ #else
  dst[row] = tmp;
+ #endif // GGML_CUDA_DMMV_F16
  }
  }

- template <int n_thread, dot_kernel_k_t dot_kernel>
- static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) {
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
- const int tid = threadIdx.x;
+ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ const half * x = (half *) vx;

- const int iter_stride = QK_K;
- const int vals_per_iter = iter_stride / n_thread;
- const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;

- float tmp = 0; // partial sum for thread in warp
+ const int nrows_y = ncols_x;
+ const int nrows_dst = nrows_x;
+ const int row_dst = row_x;

- for (int i = 0; i < ncols; i += iter_stride) {
- const int col = i + vals_per_iter*tid;
- const int ib = ib0 + col/QK_K; // x block index
- const int iqs = col%QK_K; // x quant index
- const int iybs = col - col%QK_K; // y block start index
+ float tmp = 0.0f;
+
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+ const int col_x = col_x0 + threadIdx.x;
+
+ if (col_x >= ncols_x) {
+ break;
+ }
+
+ // x is transposed and permuted
+ const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+ const float xi = __half2float(x[ix]);
+
+ const int row_y = col_x;
+
+
+ // y is not transposed but permuted
+ const int iy = channel*nrows_y + row_y;
+
+ tmp += xi * y[iy];
+ }
+
+ // dst is not transposed and not permuted
+ const int idst = channel*nrows_dst + row_dst;
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (threadIdx.x == 0) {
+ dst[idst] = tmp;
+ }
+ }
+
+ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+
+ const half * x = (half *) vx;
+
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+
+ const int nrows_y = ncols_x;
+ const int nrows_dst = nrows_x;
+ const int row_dst = row_x;
+
+ const int idst = channel*nrows_dst + row_dst;
+
+ float tmp = 0.0f;
+
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
+ const int col_x = col_x0 + threadIdx.x;
+
+ if (col_x >= ncols_x) {
+ break;
+ }
+
+ const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+ const float xi = __half2float(x[ix]);
+
+ const int row_y = col_x;
+
+ const int iy = channel*nrows_y + row_y;
+
+ tmp += xi * y[iy];
+ }
+
+ // sum up partial sums and write back result
+ __syncthreads();
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ if (threadIdx.x == 0) {
+ dst[idst] = tmp;
+ }
+ }
+
+ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+ const float * xi = (float *) cxi;
+ float * dsti = (float *) cdsti;
+
+ *dsti = *xi;
+ }
+
+ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+ const float * xi = (float *) cxi;
+ half * dsti = (half *) cdsti;
+
+ *dsti = __float2half(*xi);
+ }
+
+ template <cpy_kernel_t cpy_1>
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= ne) {
+ return;
+ }
+
+ // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+ // then combine those indices with the corresponding byte offsets to get the total offsets
+ const int i02 = i / (ne00*ne01);
+ const int i01 = (i - i02*ne01*ne00) / ne00;
+ const int i00 = i - i02*ne01*ne00 - i01*ne00;
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+ const int i12 = i / (ne10*ne11);
+ const int i11 = (i - i12*ne10*ne11) / ne10;
+ const int i10 = i - i12*ne10*ne11 - i11*ne10;
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+ cpy_1(cx + x_offset, cdst + dst_offset);
+ }
+
+ // rope == RoPE == rotary positional embedding
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
+ const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const float theta = p*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + 1];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + 1] = x0*sin_theta + x1*cos_theta;
+ }
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int i = row*ncols + col;
+ // dst[i] = col > n_past + row ? -INFINITY : x[i];
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+ }
+
+ // the CUDA soft max implementation differs from the CPU implementation
+ // instead of doubles floats are used
+ // values are also not normalized to the maximum value by subtracting it in the exponential function
+ // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
+ static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int block_size = blockDim.x;
+ const int tid = threadIdx.x;
+
+ float tmp = 0.0;
+
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
+ const int col = block_start + tid;

- float v;
- dot_kernel(vx, ib, iqs, y + iybs, v);
- tmp += v;
+ if (col >= ncols) {
+ break;
+ }
+
+ const int i = row*ncols + col;
+ const float val = expf(x[i]);
+ tmp += val;
+ dst[i] = val;
  }

- // sum up partial sums and write back result
+ // sum up partial sums
  __syncthreads();
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- if (tid == 0) {
- dst[row] = tmp;
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
+ const int col = block_start + tid;
+
+ if (col >= ncols) {
+ break;
+ }
+
+ const int i = row*ncols + col;
+ dst[i] /= tmp;
  }
  }

- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;

- if (col >= ncols) {
+ if (i >= k) {
  return;
  }

- const int row = blockDim.y*blockIdx.y + threadIdx.y;
- const int i = row*ncols + col;
-
- const float theta = p*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
-
- const float x0 = x[i + 0];
- const float x1 = x[i + 1];
-
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + 1] = x0*sin_theta + x1*cos_theta;
+ dst[i] = scale * x[i];
  }

  static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
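Editor's note: the comments on soft_max_f32 above explain that the CUDA path works in float and skips the usual subtract-the-maximum step before exponentiating. A scalar sketch of the two variants (illustrative only, not part of the diff) makes the trade-off concrete:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Numerically safe reference: shift by the maximum before exponentiating.
    static void softmax_stable(std::vector<float> & v) {
        const float max_val = *std::max_element(v.begin(), v.end());
        float sum = 0.0f;
        for (float & x : v) { x = std::exp(x - max_val); sum += x; }
        for (float & x : v) { x /= sum; }
    }

    // The shortcut taken by the kernel: exponentiate directly, then normalize.
    // Can overflow for very large logits, but is reported to be fine for LLaMA.
    static void softmax_fast(std::vector<float> & v) {
        float sum = 0.0f;
        for (float & x : v) { x = std::exp(x); sum += x; }
        for (float & x : v) { x /= sum; }
    }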
@@ -829,75 +1275,91 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
  }

- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
- const int ny = 2;
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+ const int block_num_y = (nrows + ny - 1) / ny;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(32, ny, 1);
- dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
- const dim3 block_dims(32, 2, 1);
- dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
+ const int block_num_y = (nrows + ny - 1) / ny;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(32, ny, 1);
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
- const dim3 block_dims(32, 2, 1);
- dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
+ const int block_num_y = (nrows + ny - 1) / ny;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(32, ny, 1);
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
- const dim3 block_dims(32, 2, 1);
- dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ const dim3 block_dims(32, 1, 1);
+ dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
  }

  static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % QK_K == 0);
- const dim3 block_dims(32, 2, 1);
- dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
+ const int block_num_y = (nrows + ny - 1) / ny;
+ const dim3 block_nums(1, block_num_y, 1);
+ const dim3 block_dims(32, ny, 1);
+ dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
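Editor's note: the launcher changes above drop the nrows % GGML_CUDA_DMMV_Y == 0 asserts in favour of a ceil-division grid plus an in-kernel bounds check on the row index. A generic sketch of that pattern (names are illustrative, not part of the diff):

    // Launch enough blocks to cover nrows even when it is not a multiple of the
    // block height; rows past the end simply return.
    static __global__ void row_kernel(float * dst, const int nrows) {
        const int row = blockIdx.y*blockDim.y + threadIdx.y;
        if (row >= nrows) {
            return; // the grid may overshoot, so guard out-of-range rows
        }
        dst[row] = 0.0f; // placeholder body
    }

    static void launch_row_kernel(float * dst, const int nrows, cudaStream_t stream) {
        const int rows_per_block = 2;                                          // GGML_CUDA_DMMV_Y-style
        const int block_num_y = (nrows + rows_per_block - 1) / rows_per_block; // ceil division
        const dim3 block_nums(1, block_num_y, 1);
        const dim3 block_dims(32, rows_per_block, 1);
        row_kernel<<<block_nums, block_dims, 0, stream>>>(dst, nrows);
    }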
@@ -905,12 +1367,13 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

- static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+ const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
  dequantize_mul_mat_vec<1, 1, convert_f16>
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1405,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  }
  }

+ static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
+ const dim3 block_nums(1, nrows_x, nchannels_x);
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+ }
+
+ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
+ const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+
+ const dim3 block_nums(1, nrows_x, nchannels_x);
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+ }
+
+ static void ggml_cpy_f32_f32_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void ggml_cpy_f32_f16_cuda(
+ const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+ }
+
+ static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+ }
+
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(nrows % 2 == 0);
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -950,6 +1454,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
  }

+ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+ const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+ const dim3 block_nums(block_num_x, nrows_x, 1);
+ diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+ }
+
+ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ const dim3 block_nums(1, nrows_x, 1);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ }
+
  // buffer pool for cuda
  #define MAX_CUDA_BUFFERS 256

@@ -1018,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
  static size_t g_scratch_offset = 0;

- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
- #define GGML_CUDA_MAX_EVENTS 64
-
  static int g_device_count = -1;
  static int g_main_device = 0;
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };

  void ggml_init_cublas() {
  static bool initialized = false;
@@ -1054,15 +1565,8 @@ void ggml_init_cublas() {
1054
1565
  for (int id = 0; id < g_device_count; ++id) {
1055
1566
  CUDA_CHECK(cudaSetDevice(id));
1056
1567
 
1057
- // create streams
1058
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1059
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1060
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1061
- }
1062
- // create events
1063
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1064
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1065
- }
1568
+ // create main stream
1569
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
1066
1570
 
1067
1571
  // create cublas handle
1068
1572
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1105,6 +1609,9 @@ void * ggml_cuda_host_malloc(size_t size) {
1105
1609
  void * ptr = nullptr;
1106
1610
  cudaError_t err = cudaMallocHost((void **) &ptr, size);
1107
1611
  if (err != cudaSuccess) {
1612
+ // The allocation error can be bypassed: a null ptr is returned from this function instead.
1613
+ // This works around the OOM error seen in WSL.
1614
+ cudaGetLastError();
1108
1615
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
1109
1616
  size/1024.0/1024.0, cudaGetErrorString(err));
1110
1617
  return nullptr;
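The cudaGetLastError() call is what actually clears the sticky error state, so later CUDA_CHECKs do not trip over the failed cudaMallocHost; callers are then expected to fall back to pageable memory when nullptr comes back. A hedged sketch of that caller-side fallback, assuming the ggml_cuda_host_malloc declaration from ggml-cuda.h (the helper name is made up for illustration):

#include <cstdlib>
#include "ggml-cuda.h"

// hypothetical caller-side fallback: prefer pinned memory, degrade to plain malloc
static void * host_buffer_alloc(size_t size, bool * pinned_out) {
    void * ptr = ggml_cuda_host_malloc(size); // returns nullptr if pinning failed
    *pinned_out = ptr != nullptr;
    return ptr != nullptr ? ptr : malloc(size);
}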
@@ -1117,10 +1624,25 @@ void ggml_cuda_host_free(void * ptr) {
1117
1624
  CUDA_CHECK(cudaFreeHost(ptr));
1118
1625
  }
1119
1626
 
1120
- static cudaError_t ggml_cuda_h2d_tensor_2d(
1627
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1121
1628
  void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1122
1629
 
1123
- char * dst_char = (char *) dst;
1630
+ cudaMemcpyKind kind;
1631
+ char * src_ptr;
1632
+ if (src->backend == GGML_BACKEND_CPU) {
1633
+ kind = cudaMemcpyHostToDevice;
1634
+ src_ptr = (char *) src->data;
1635
+ } else if (src->backend == GGML_BACKEND_GPU) {
1636
+ kind = cudaMemcpyDeviceToDevice;
1637
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
1638
+ int id;
1639
+ CUDA_CHECK(cudaGetDevice(&id));
1640
+ src_ptr = (char *) extra->data_device[id];
1641
+ } else {
1642
+ GGML_ASSERT(false);
1643
+ }
1644
+ char * dst_ptr = (char *) dst;
1645
+
1124
1646
  const int64_t ne0 = src->ne[0];
1125
1647
  const int64_t nb0 = src->nb[0];
1126
1648
  const int64_t nb1 = src->nb[1];
@@ -1131,17 +1653,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
1131
1653
  const int64_t bs = ggml_blck_size(type);
1132
1654
  int64_t i1_diff = i1_high - i1_low;
1133
1655
 
1134
- const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
1656
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1135
1657
  if (nb0 == ts && nb1 == ts*ne0/bs) {
1136
- return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
1658
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
1137
1659
  } else if (nb0 == ts) {
1138
- return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
1660
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
1139
1661
  } else {
1140
1662
  for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1141
1663
  const void * rx = (const void *) ((const char *) x + i1*nb1);
1142
- void * rd = (void *) (dst_char + i1*ts*ne0/bs);
1664
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1143
1665
  // pretend the row is a matrix with cols=1
1144
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
1666
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
1145
1667
  if (r != cudaSuccess) return r;
1146
1668
  }
1147
1669
  return cudaSuccess;
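The renamed ggml_cuda_cpy_tensor_2d keeps the same three-tier copy strategy as before, now parameterized by cudaMemcpyKind: one flat cudaMemcpyAsync when the rows are fully contiguous, one pitched cudaMemcpy2DAsync when only the elements within a row are contiguous, and a per-row loop of pitched copies otherwise. A compact standalone illustration of the first two tiers, with plain float rows and invented arguments:

#include <cuda_runtime.h>

// copy `rows` rows of `ne0` floats whose source rows are `nb1` bytes apart
static cudaError_t copy_rows_2d(float * dst, const float * src, size_t ne0, size_t rows,
                                size_t nb1, cudaMemcpyKind kind, cudaStream_t stream) {
    const size_t row_bytes = ne0*sizeof(float);
    if (nb1 == row_bytes) {
        return cudaMemcpyAsync(dst, src, row_bytes*rows, kind, stream);       // contiguous
    }
    return cudaMemcpy2DAsync(dst, row_bytes, src, nb1, row_bytes, rows, kind, stream); // strided rows
}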
@@ -1260,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1260
1782
  const int64_t ne00 = src0->ne[0];
1261
1783
  const int64_t nrows = i01_high - i01_low;
1262
1784
 
1785
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1786
+ #ifdef GGML_CUDA_DMMV_F16
1787
+ size_t ash;
1788
+ dfloat * src1_dfloat = nullptr; // dfloat == half
1789
+
1790
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1791
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1792
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1793
+
1794
+ if (src1_convert_f16) {
1795
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
1796
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
1797
+ ne00, 1, sizeof(float), 0, 0,
1798
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
1799
+ }
1800
+ #else
1801
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
1802
+ #endif // GGML_CUDA_DMMV_F16
1803
+
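With GGML_CUDA_DMMV_F16 enabled, src1 is converted to half right here via the same cpy_f32_f16 path used for GGML_OP_CPY, so the dequantize-mul-mat-vec kernels can work in half2. As an illustration only, a minimal contiguous float-to-half conversion kernel (not the templated cpy kernel from this file, which also handles the strided nb*/ne* layouts passed above) would look like:

#include <cuda_fp16.h>

// illustrative contiguous float -> half conversion
static __global__ void convert_f32_to_f16(const float * x, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < k) {
        dst[i] = __float2half(x[i]);
    }
}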
1263
1804
  switch (src0->type) {
1264
1805
  case GGML_TYPE_Q4_0:
1265
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1806
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1266
1807
  break;
1267
1808
  case GGML_TYPE_Q4_1:
1268
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1809
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1269
1810
  break;
1270
1811
  case GGML_TYPE_Q5_0:
1271
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1812
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1272
1813
  break;
1273
1814
  case GGML_TYPE_Q5_1:
1274
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1815
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1275
1816
  break;
1276
1817
  case GGML_TYPE_Q8_0:
1277
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1818
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1278
1819
  break;
1279
1820
  case GGML_TYPE_Q2_K:
1280
1821
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -1292,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1292
1833
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1293
1834
  break;
1294
1835
  case GGML_TYPE_F16:
1295
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1836
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1296
1837
  break;
1297
1838
  default:
1298
1839
  GGML_ASSERT(false);
@@ -1300,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1300
1841
  }
1301
1842
  CUDA_CHECK(cudaGetLastError());
1302
1843
 
1844
+ #ifdef GGML_CUDA_DMMV_F16
1845
+ if (src1_convert_f16) {
1846
+ ggml_cuda_pool_free(src1_dfloat, ash);
1847
+ }
1848
+ #endif // GGML_CUDA_DMMV_F16
1849
+
1303
1850
  (void) src1;
1304
1851
  (void) dst;
1305
1852
  (void) src0_ddf_i;
@@ -1377,8 +1924,81 @@ inline void ggml_cuda_op_rope(
1377
1924
  (void) i1;
1378
1925
  }
1379
1926
 
1927
+ inline void ggml_cuda_op_diag_mask_inf(
1928
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1929
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1930
+ cudaStream_t & cudaStream_main){
1931
+
1932
+ GGML_ASSERT(src0_ddf_i != nullptr);
1933
+ GGML_ASSERT(dst_ddf_i != nullptr);
1934
+
1935
+ const int64_t ne00 = src0->ne[0];
1936
+ const int64_t ne01 = src0->ne[1];
1937
+ const int64_t i01_diff = i01_high - i01_low;
1938
+
1939
+ const int n_past = ((int32_t *) src1->data)[0];
1940
+
1941
+ // compute
1942
+ diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
1943
+ CUDA_CHECK(cudaGetLastError());
1944
+
1945
+ (void) dst;
1946
+ (void) src0_ddq_i;
1947
+ (void) src1_ddf_i;
1948
+ (void) i02;
1949
+ (void) i1;
1950
+ }
1951
+
1952
+ inline void ggml_cuda_op_soft_max(
1953
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1954
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1955
+ cudaStream_t & cudaStream_main){
1956
+
1957
+ GGML_ASSERT(src0_ddf_i != nullptr);
1958
+ GGML_ASSERT(dst_ddf_i != nullptr);
1959
+
1960
+ const int64_t ne00 = src0->ne[0];
1961
+ const int64_t i01_diff = i01_high - i01_low;
1962
+
1963
+ // compute
1964
+ soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
1965
+ CUDA_CHECK(cudaGetLastError());
1966
+
1967
+ (void) src1;
1968
+ (void) dst;
1969
+ (void) src0_ddq_i;
1970
+ (void) src1_ddf_i;
1971
+ (void) i02;
1972
+ (void) i1;
1973
+ }
1974
+
1975
+ inline void ggml_cuda_op_scale(
1976
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1977
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1978
+ cudaStream_t & cudaStream_main){
1979
+
1980
+ GGML_ASSERT(src0_ddf_i != nullptr);
1981
+ GGML_ASSERT(dst_ddf_i != nullptr);
1982
+
1983
+ const float scale = ((float *) src1->data)[0];
1984
+
1985
+ const int64_t ne00 = src0->ne[0];
1986
+ const int64_t i01_diff = i01_high - i01_low;
1987
+
1988
+ // compute
1989
+ scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
1990
+ CUDA_CHECK(cudaGetLastError());
1991
+
1992
+ (void) src1;
1993
+ (void) dst;
1994
+ (void) src0_ddq_i;
1995
+ (void) src1_ddf_i;
1996
+ (void) i02;
1997
+ (void) i1;
1998
+ }
1999
+
1380
2000
  static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
1381
- ggml_cuda_op_t op, bool src0_needs_f32) {
2001
+ ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
1382
2002
  const int64_t ne00 = src0->ne[0];
1383
2003
  const int64_t ne01 = src0->ne[1];
1384
2004
  const int64_t ne02 = src0->ne[2];
@@ -1401,21 +2021,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1401
2021
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
1402
2022
 
1403
2023
  // strides for iteration over dims 3 and 2
1404
- const int64_t src0_stride = ne00 * ne01;
1405
- const int64_t src1_stride = ne10 * ne11;
1406
- const int64_t dst_stride = ne0 * ne1;
1407
- const int64_t num_iters = ne02 * ne03;
2024
+ const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
2025
+ const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
2026
+ const int64_t src0_stride = ne00 * ne01 * stride_mod;
2027
+ const int64_t src1_stride = ne10 * ne11 * stride_mod;
2028
+ const int64_t dst_stride = ne0 * ne1 * stride_mod;
1408
2029
 
1409
2030
  const size_t src0_ts = ggml_type_size(src0->type);
1410
2031
  const size_t src0_bs = ggml_blck_size(src0->type);
1411
2032
 
1412
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2033
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
1413
2034
  struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
1414
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2035
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
1415
2036
 
1416
2037
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
2038
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1417
2039
  const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
1418
2040
 
2041
+ const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
2042
+ const bool src1_stays_on_host = use_src1 && (
2043
+ dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
2044
+
1419
2045
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
1420
2046
 
1421
2047
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -1424,13 +2050,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1424
2050
  char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
1425
2051
  float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
1426
2052
  float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1427
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
2053
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1428
2054
 
1429
2055
  // asq = actual size quantized, asf = actual size float
1430
2056
  size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
1431
2057
  size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
1432
2058
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1433
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2059
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2060
+
2061
+ // if multiple GPUs are used they need to wait for the main GPU to finish
2062
+ if (split && g_device_count > 1) {
2063
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2064
+ CUDA_CHECK(cudaDeviceSynchronize());
2065
+ }
1434
2066
 
1435
2067
  for (int id = 0; id < g_device_count; ++id) {
1436
2068
  if (!split && id != g_main_device) {
@@ -1443,9 +2075,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1443
2075
  int64_t row_low, row_high;
1444
2076
  if (split) {
1445
2077
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
1446
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1447
2078
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
1448
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1449
2079
  } else {
1450
2080
  row_low = 0;
1451
2081
  row_high = nrows0;
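Dropping the GGML_CUDA_DMMV_Y rounding means the per-device row ranges now come straight from the g_tensor_split fractions; the matching rounding is also removed from ggml_cuda_transform_tensor further down, so both sides stay consistent. A self-contained sketch of how such prefix fractions map to row ranges, with invented values:

#include <cstdint>
#include <cstdio>

// g_tensor_split-style prefix fractions: device id owns rows [nrows*split[id], nrows*split[id+1])
static void print_row_ranges(const float * split, int device_count, int64_t nrows) {
    for (int id = 0; id < device_count; ++id) {
        const int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows*split[id]);
        const int64_t row_high = id == device_count - 1 ? nrows : (int64_t)(nrows*split[id + 1]);
        printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
}

// e.g. two GPUs with a 60/40 split of 4096 rows:
//   const float split[2] = {0.0f, 0.6f};
//   print_row_ranges(split, 2, 4096);  // device 0: [0, 2457), device 1: [2457, 4096)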
@@ -1458,7 +2088,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1458
2088
 
1459
2089
  cudaSetDevice(id);
1460
2090
 
1461
- if (src0_on_device) {
2091
+ if (src0_on_device && src0_is_contiguous) {
1462
2092
  if (src0_is_f32) {
1463
2093
  src0_ddf[id] = (float *) src0_extra->data_device[id];
1464
2094
  } else {
@@ -1476,8 +2106,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1476
2106
  src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
1477
2107
  }
1478
2108
 
1479
- if (use_src1) {
1480
- if (src1_on_device) {
2109
+ if (use_src1 && !src1_stays_on_host) {
2110
+ if (src1_on_device && src1_is_contiguous) {
1481
2111
  src1_ddf[id] = (float *) src1_extra->data_device[id];
1482
2112
  } else {
1483
2113
  src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
@@ -1490,26 +2120,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1490
2120
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
1491
2121
  }
1492
2122
 
1493
- for (int64_t i03 = 0; i03 < ne03; i03++) {
2123
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
2124
+ const int64_t i02_max = flatten_rows ? 1 : ne02;
2125
+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
2126
+
2127
+ for (int64_t i03 = 0; i03 < i03_max; i03++) {
1494
2128
  const int64_t i13 = i03 % ne13;
1495
- for (int64_t i02 = 0; i02 < ne02; i02++) {
2129
+ for (int64_t i02 = 0; i02 < i02_max; i02++) {
1496
2130
  const int64_t i12 = i02 % ne12;
1497
2131
 
1498
2132
  const int64_t i0 = i03*ne02 + i02;
1499
- const int64_t i0_offset_low = row_low/ne01;
1500
- const int64_t i0_offset_high = row_high/ne01;
2133
+
2134
+ // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
2135
+ const int64_t i0_offset_low = row_low/rows_per_iter;
2136
+ const int64_t i0_offset_high = row_high/rows_per_iter;
1501
2137
 
1502
2138
  int64_t i01_low = 0;
1503
- int64_t i01_high = ne01;
2139
+ int64_t i01_high = rows_per_iter;
1504
2140
  if (split) {
1505
2141
  if (i0 < i0_offset_low || i0 > i0_offset_high) {
1506
2142
  continue;
1507
2143
  }
1508
2144
  if (i0 == i0_offset_low) {
1509
- i01_low = row_low % ne01;
2145
+ i01_low = row_low % rows_per_iter;
1510
2146
  }
1511
2147
  if (i0 == i0_offset_high) {
1512
- i01_high = row_high % ne01;
2148
+ i01_high = row_high % rows_per_iter;
1513
2149
  }
1514
2150
  }
1515
2151
 
@@ -1518,7 +2154,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1518
2154
  // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
1519
2155
  // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
1520
2156
  GGML_ASSERT(i01_low == 0 || g_device_count > 1);
1521
- GGML_ASSERT(i01_high == ne01 || g_device_count > 1);
2157
+ GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
1522
2158
 
1523
2159
  const int64_t i01_diff = i01_high - i01_low;
1524
2160
  if (i01_diff == 0) {
@@ -1526,24 +2162,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1526
2162
  }
1527
2163
  const int64_t i11 = i13*ne12 + i12;
1528
2164
 
1529
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1530
- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1531
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2165
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
1532
2166
 
1533
2167
  // for split tensors the data begins at i0 == i0_offset_low
1534
2168
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
1535
2169
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
1536
2170
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
1537
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
2171
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
1538
2172
 
1539
2173
  // for split tensors the data pointer needs to be rounded down
1540
2174
  // to the bin edge for i03, i02 bins beyond the first
1541
2175
  if (i0 - i0_offset_low > 0) {
2176
+ GGML_ASSERT(!flatten_rows);
1542
2177
  src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
1543
2178
  src0_ddf_i -= (row_low % ne01)*ne00;
1544
- }
1545
- if (i0 - i0_offset_low > 0) {
1546
- dst_ddf_i -= (row_low % ne0)*ne1;
2179
+ dst_ddf_i -= (row_low % ne0)*ne1;
1547
2180
  }
1548
2181
 
1549
2182
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
@@ -1553,38 +2186,41 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1553
2186
  }
1554
2187
 
1555
2188
  // copy src0, src1 to device if necessary
1556
- if (use_src1) {
2189
+ if (use_src1 && !src1_stays_on_host) {
1557
2190
  if (src1->backend == GGML_BACKEND_CPU) {
1558
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
1559
- } else if (src1->backend == GGML_BACKEND_GPU) {
2191
+ GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
2192
+ int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2193
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
2194
+ } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
1560
2195
  if (id != g_main_device) {
2196
+ GGML_ASSERT(!flatten_rows);
1561
2197
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1562
2198
  src1_ddf_i_source += i11*src1_stride;
1563
2199
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1564
- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2200
+ cudaMemcpyDeviceToDevice, cudaStream_main));
1565
2201
  }
2202
+ } else if (src1_on_device && !src1_is_contiguous) {
2203
+ GGML_ASSERT(!split);
2204
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
1566
2205
  } else {
1567
2206
  GGML_ASSERT(false);
1568
2207
  }
1569
2208
  }
1570
- CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1571
- if (!src0_on_device) {
2209
+
2210
+ if (!src0_on_device || !src0_is_contiguous) {
1572
2211
  if (src0_is_f32) {
1573
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2212
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1574
2213
  } else {
1575
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1576
2215
  }
1577
2216
  }
1578
2217
 
1579
- // convert src0 to f32 if it's necessary for the ggml_cuda_op
2218
+ // convert src0 to f32 if it is necessary for the ggml_cuda_op
1580
2219
  if (src0_needs_f32 && !src0_is_f32) {
1581
2220
  to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
1582
2221
  CUDA_CHECK(cudaGetLastError());
1583
2222
  }
1584
2223
 
1585
- // wait with main stream until src1 memcpy is done
1586
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
1587
-
1588
2224
  // do the computation
1589
2225
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
1590
2226
 
@@ -1622,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1622
2258
 
1623
2259
  // wait until each device is finished, then free their buffers
1624
2260
  for (int id = 0; id < g_device_count; ++id) {
2261
+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
2262
+ continue;
2263
+ }
2264
+
1625
2265
  CUDA_CHECK(cudaSetDevice(id));
1626
2266
  CUDA_CHECK(cudaDeviceSynchronize());
2267
+
1627
2268
  if (src0_asq[id] > 0) {
1628
2269
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
1629
2270
  }
@@ -1641,39 +2282,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1641
2282
 
1642
2283
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1643
2284
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1644
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
2285
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
1645
2286
  }
1646
2287
 
1647
2288
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1648
2289
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1649
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
2290
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
1650
2291
  }
1651
2292
 
1652
2293
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1653
2294
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1654
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
2295
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
1655
2296
  }
1656
2297
 
1657
2298
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1658
2299
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1659
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
2300
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
1660
2301
  }
1661
2302
 
1662
2303
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1663
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
1664
2304
  const int64_t ne10 = src1->ne[0];
1665
2305
 
1666
2306
  const int64_t ne0 = dst->ne[0];
1667
2307
  const int64_t ne1 = dst->ne[1];
1668
2308
 
1669
- // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
1670
- // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
1671
- // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
1672
- // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
1673
- // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
1674
- // return false;
1675
- // }
1676
-
1677
2309
  // TODO: find the optimal values for these
1678
2310
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1679
2311
  src1->type == GGML_TYPE_F32 &&
@@ -1685,23 +2317,152 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
1685
2317
  return false;
1686
2318
  }
1687
2319
 
2320
+ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2321
+ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2322
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2323
+ GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
2324
+ GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
2325
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2326
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2327
+
2328
+ const int64_t ne00 = src0->ne[0];
2329
+ const int64_t ne01 = src0->ne[1];
2330
+ const int64_t ne02 = src0->ne[2];
2331
+
2332
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2333
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2334
+
2335
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2336
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2337
+
2338
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2339
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2340
+
2341
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2342
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2343
+
2344
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2345
+ }
2346
+
2347
+ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2348
+ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
2349
+ GGML_ASSERT(!ggml_is_permuted(src0));
2350
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2351
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2352
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2353
+
2354
+ const int64_t ne00 = src0->ne[0];
2355
+ const int64_t ne01 = src0->ne[1];
2356
+ const int64_t ne02 = src0->ne[2];
2357
+
2358
+ const int64_t nb01 = src0->nb[1];
2359
+ const int64_t nb02 = src0->nb[2];
2360
+
2361
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2362
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2363
+
2364
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2365
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2366
+
2367
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2368
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2369
+
2370
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2371
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2372
+
2373
+ const int row_stride_x = nb01 / sizeof(half);
2374
+ const int channel_stride_x = nb02 / sizeof(half);
2375
+
2376
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2377
+ }
2378
+
1688
2379
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1689
- if (src0->type == GGML_TYPE_F32) {
1690
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
2380
+ bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
2381
+ src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
2382
+
2383
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
2384
+ ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
2385
+ } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
2386
+ ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
2387
+ } else if (src0->type == GGML_TYPE_F32) {
2388
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
1691
2389
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
1692
- if (src1->ne[1] == 1) {
1693
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
2390
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2391
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
1694
2392
  } else {
1695
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
2393
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
1696
2394
  }
1697
2395
  } else {
1698
2396
  GGML_ASSERT(false);
1699
2397
  }
1700
2398
  }
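The dispatch above reads as a small decision tree: fully-on-GPU permuted f16 src0 against a single-column f32 src1 goes to the p021 kernel, fully-on-GPU non-contiguous f16 src0 against a contiguous single-column src1 goes to the nc kernel, f32 src0 goes to cuBLAS, and quantized/f16 src0 takes the dequantize-mul-mat-vec path only when src1 has one column and src0's dimensions divide by GGML_CUDA_DMMV_X/GGML_CUDA_DMMV_Y, otherwise falling back to cuBLAS. A condensed restatement of that predicate (illustrative helper mirroring the checks above, not a function from this file):

#include <cstdint>

enum mul_mat_path { PATH_P021, PATH_NC, PATH_CUBLAS, PATH_DMMV };

static mul_mat_path choose_mul_mat_path(bool all_on_device, bool src0_permuted, bool src1_permuted,
                                        bool src0_contiguous, bool src1_contiguous,
                                        bool src0_f32, bool src0_quantized_or_f16,
                                        int64_t src1_ncols, bool dmmv_divisible) {
    if (all_on_device && src0_permuted && src1_permuted && src1_ncols == 1)       return PATH_P021;
    if (all_on_device && !src0_contiguous && src1_contiguous && src1_ncols == 1)  return PATH_NC;
    if (src0_f32)                                                                 return PATH_CUBLAS;
    if (src0_quantized_or_f16 && src1_ncols == 1 && dmmv_divisible)               return PATH_DMMV;
    return PATH_CUBLAS; // (the real function asserts on any other src0 type)
}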
1701
2399
 
2400
+ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2401
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2402
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
2403
+ }
2404
+
2405
+ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2406
+ const int64_t ne = ggml_nelements(src0);
2407
+ GGML_ASSERT(ne == ggml_nelements(src1));
2408
+
2409
+ GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2410
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
2411
+
2412
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
2413
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
2414
+
2415
+ const int64_t ne00 = src0->ne[0];
2416
+ const int64_t ne01 = src0->ne[1];
2417
+ GGML_ASSERT(src0->ne[3] == 1);
2418
+
2419
+ const int64_t nb00 = src0->nb[0];
2420
+ const int64_t nb01 = src0->nb[1];
2421
+ const int64_t nb02 = src0->nb[2];
2422
+
2423
+ const int64_t ne10 = src1->ne[0];
2424
+ const int64_t ne11 = src1->ne[1];
2425
+ GGML_ASSERT(src1->ne[3] == 1);
2426
+
2427
+ const int64_t nb10 = src1->nb[0];
2428
+ const int64_t nb11 = src1->nb[1];
2429
+ const int64_t nb12 = src1->nb[2];
2430
+
2431
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2432
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2433
+
2434
+ const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2435
+ const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2436
+
2437
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2438
+ char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
2439
+
2440
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
2441
+ ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2442
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2443
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
2444
+ ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2445
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2446
+ } else {
2447
+ GGML_ASSERT(false);
2448
+ }
2449
+
2450
+ (void) dst;
2451
+ }
2452
+
2453
+ void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2454
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2455
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
2456
+ }
2457
+
2458
+ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2459
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2460
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
2461
+ }
2462
+
1702
2463
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1703
2464
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1704
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
2465
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
1705
2466
  }
1706
2467
 
1707
2468
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1710,16 +2471,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
1710
2471
  (void) dst;
1711
2472
  }
1712
2473
 
1713
- void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
1714
- FILE * fp = fopen(fname, "rb");
2474
+ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
1715
2475
  int nrows = ggml_nrows(tensor);
1716
2476
  const size_t nb1 = tensor->nb[1];
1717
2477
  ggml_backend backend = tensor->backend;
1718
2478
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
2479
+ memset(extra, 0, sizeof(*extra));
1719
2480
 
1720
2481
  for (int id = 0; id < g_device_count; ++id) {
1721
- extra->data_device[id] = nullptr;
1722
-
1723
2482
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
1724
2483
  continue;
1725
2484
  }
@@ -1732,10 +2491,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
1732
2491
  row_high = nrows;
1733
2492
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
1734
2493
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
1735
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1736
2494
  row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
1737
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1738
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1739
2495
  } else {
1740
2496
  GGML_ASSERT(false);
1741
2497
  }
@@ -1745,35 +2501,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
1745
2501
 
1746
2502
  int64_t nrows_split = row_high - row_low;
1747
2503
 
1748
- const size_t offset_split = offset + row_low*nb1;
2504
+ const size_t offset_split = row_low*nb1;
1749
2505
  const size_t size = ggml_nbytes_split(tensor, nrows_split);
1750
2506
 
1751
2507
  void * buf;
1752
2508
  CUDA_CHECK(cudaMalloc(&buf, size));
1753
- void * buf_host = malloc(size);
1754
-
1755
- #ifdef _WIN32
1756
- int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
1757
- #else
1758
- int ret = fseek(fp, (long) offset_split, SEEK_SET);
1759
- #endif
1760
- GGML_ASSERT(ret == 0); // same
1761
-
1762
- size_t ret2 = fread(buf_host, size, 1, fp);
1763
- if (ret2 != 1) {
1764
- fprintf(stderr, "unexpectedly reached end of file");
1765
- exit(1);
1766
- }
2509
+ void * buf_host = (char*)data + offset_split;
1767
2510
 
1768
2511
  cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
1769
- cudaDeviceSynchronize();
1770
2512
 
1771
- free(buf_host);
1772
2513
  extra->data_device[id] = buf;
1773
2514
  }
1774
2515
 
1775
2516
  tensor->extra = extra;
1776
- fclose(fp);
1777
2517
  }
1778
2518
 
1779
2519
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -1795,47 +2535,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
1795
2535
  delete extra;
1796
2536
  }
1797
2537
 
1798
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
1799
- if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
1800
- ggml_cuda_assign_buffers(tensor);
2538
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2539
+ if (scratch && g_scratch_size == 0) {
2540
+ return;
1801
2541
  }
1802
2542
 
1803
- const size_t size = ggml_nbytes(tensor);
1804
- GGML_ASSERT(size <= g_scratch_size);
1805
- if (g_scratch_offset + size > g_scratch_size) {
1806
- g_scratch_offset = 0;
2543
+ // recursively assign CUDA buffers until a compute tensor is found
2544
+ if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2545
+ const ggml_op src0_op = tensor->src0->op;
2546
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2547
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
2548
+ }
2549
+ }
2550
+ if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2551
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
1807
2552
  }
1808
2553
 
1809
2554
  tensor->backend = GGML_BACKEND_GPU;
1810
2555
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
1811
2556
 
1812
- bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
2557
+ const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2558
+ tensor->op == GGML_OP_VIEW;
2559
+ const size_t size = ggml_nbytes(tensor);
1813
2560
 
1814
2561
  CUDA_CHECK(cudaSetDevice(g_main_device));
1815
2562
  if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
1816
2563
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
1817
- extra->data_device[g_main_device] = src0_extra->data_device;
1818
- GGML_ASSERT(false);
1819
- } else {
2564
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2565
+ size_t offset = 0;
2566
+ if (tensor->op == GGML_OP_VIEW) {
2567
+ memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
2568
+ }
2569
+ extra->data_device[g_main_device] = src0_ddc + offset;
2570
+ } else if (tensor->op == GGML_OP_CPY) {
2571
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
2572
+ void * src1_ddv = src1_extra->data_device[g_main_device];
2573
+ extra->data_device[g_main_device] = src1_ddv;
2574
+ } else if (scratch) {
2575
+ GGML_ASSERT(size <= g_scratch_size);
2576
+ if (g_scratch_offset + size > g_scratch_size) {
2577
+ g_scratch_offset = 0;
2578
+ }
2579
+
1820
2580
  char * data = (char *) g_scratch_buffer;
1821
2581
  if (data == nullptr) {
1822
2582
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
1823
2583
  g_scratch_buffer = data;
1824
2584
  }
1825
2585
  extra->data_device[g_main_device] = data + g_scratch_offset;
1826
- }
1827
2586
 
1828
- // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]);
1829
- g_scratch_offset += size;
1830
- // fprintf(stderr, "%s: scratch %d, %p - %p\n",
1831
- // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
2587
+ g_scratch_offset += size;
2588
+
2589
+ GGML_ASSERT(g_scratch_offset <= g_scratch_size);
2590
+ } else { // allocate new buffers outside of scratch
2591
+ void * data;
2592
+ CUDA_CHECK(cudaMalloc(&data, size));
2593
+ CUDA_CHECK(cudaMemset(data, 0, size));
2594
+ extra->data_device[g_main_device] = data;
2595
+ }
1832
2596
 
1833
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
1834
2597
  tensor->extra = extra;
1835
2598
  }
1836
2599
 
2600
+ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2601
+ ggml_cuda_assign_buffers_impl(tensor, true);
2602
+ }
2603
+
2604
+ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2605
+ ggml_cuda_assign_buffers_impl(tensor, false);
2606
+ }
2607
+
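ggml_cuda_assign_buffers_impl now serves two entry points: the scratch variant parks intermediate tensors in the shared scratch buffer, wrapping g_scratch_offset back to zero when the next allocation would overflow it, while the no-scratch variant cudaMalloc's a dedicated zero-initialized buffer for long-lived tensors such as the KV cache. A hedged usage sketch, assuming these entry points and ggml_cuda_set_scratch_size are the ones exported through ggml-cuda.h as in upstream llama.cpp; the tensor names and sizes are illustrative only:

#include "ggml.h"
#include "ggml-cuda.h"

// illustrative offload of one layer's tensors; the ggml_tensor pointers are assumed
// to come from an existing graph
static void offload_layer(struct ggml_tensor * cur, struct ggml_tensor * kv_cache_k) {
    ggml_cuda_set_scratch_size(512u*1024u*1024u);    // scratch pool for intermediate results
    ggml_cuda_assign_buffers(cur);                   // intermediate tensor -> scratch ring
    ggml_cuda_assign_buffers_no_scratch(kv_cache_k); // long-lived tensor -> dedicated VRAM
}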
1837
2608
  void ggml_cuda_set_main_device(int main_device) {
1838
- if (main_device > g_device_count) {
2609
+ if (main_device >= g_device_count) {
1839
2610
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
1840
2611
  main_device, g_device_count, g_main_device);
1841
2612
  return;
@@ -1852,6 +2623,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
1852
2623
  g_scratch_size = scratch_size;
1853
2624
  }
1854
2625
 
2626
+ void ggml_cuda_free_scratch() {
2627
+ if (g_scratch_buffer == nullptr) {
2628
+ return;
2629
+ }
2630
+
2631
+ CUDA_CHECK(cudaFree(g_scratch_buffer));
2632
+ g_scratch_buffer = nullptr;
2633
+ }
2634
+
1855
2635
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
1856
2636
  ggml_cuda_func_t func;
1857
2637
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -1889,12 +2669,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
1889
2669
  }
1890
2670
  func = ggml_cuda_mul_mat;
1891
2671
  break;
2672
+ case GGML_OP_SCALE:
2673
+ if (!any_on_device) {
2674
+ return false;
2675
+ }
2676
+ func = ggml_cuda_scale;
2677
+ break;
2678
+ case GGML_OP_CPY:
2679
+ if (!any_on_device) {
2680
+ return false;
2681
+ }
2682
+ func = ggml_cuda_cpy;
2683
+ break;
1892
2684
  case GGML_OP_RESHAPE:
2685
+ case GGML_OP_VIEW:
2686
+ case GGML_OP_PERMUTE:
2687
+ case GGML_OP_TRANSPOSE:
1893
2688
  if (!any_on_device) {
1894
2689
  return false;
1895
2690
  }
1896
2691
  func = ggml_cuda_nop;
1897
2692
  break;
2693
+ case GGML_OP_DIAG_MASK_INF:
2694
+ if (!any_on_device) {
2695
+ return false;
2696
+ }
2697
+ func = ggml_cuda_diag_mask_inf;
2698
+ break;
2699
+ case GGML_OP_SOFT_MAX:
2700
+ if (!any_on_device) {
2701
+ return false;
2702
+ }
2703
+ func = ggml_cuda_soft_max;
2704
+ break;
1898
2705
  case GGML_OP_ROPE:
1899
2706
  if (!any_on_device) {
1900
2707
  return false;