llama_cpp 0.2.0 → 0.2.2

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  #include <cstddef>
2
2
  #include <cstdint>
3
+ #include <limits>
3
4
  #include <stdint.h>
4
5
  #include <stdio.h>
5
6
  #include <atomic>
@@ -12,6 +13,10 @@
12
13
  #include "ggml-cuda.h"
13
14
  #include "ggml.h"
14
15
 
16
+ #if defined(_MSC_VER)
17
+ #pragma warning(disable: 4244 4267) // possible loss of data
18
+ #endif
19
+
15
20
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
16
21
 
17
22
  #define CUDA_CHECK(err) \
@@ -24,7 +29,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
24
29
  } \
25
30
  } while (0)
26
31
 
27
- #if CUDART_VERSION >= 12
32
+ #if CUDART_VERSION >= 12000
28
33
  #define CUBLAS_CHECK(err) \
29
34
  do { \
30
35
  cublasStatus_t err_ = (err); \
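
Note on the corrected guard above: CUDART_VERSION encodes the runtime version as 1000*major + 10*minor, so CUDA 11.8 reports 11080 and CUDA 12.0 reports 12000. The old check "#if CUDART_VERSION >= 12" was therefore true on every CUDA release; ">= 12000" restricts the CUDA 12 variant of CUBLAS_CHECK to CUDA 12.0 and newer, as intended.
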
@@ -45,9 +50,18 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
45
50
  } while (0)
46
51
  #endif // CUDART_VERSION >= 11
47
52
 
48
- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
53
+ #ifdef GGML_CUDA_DMMV_F16
54
+ typedef half dfloat; // dequantize float
55
+ typedef half2 dfloat2;
56
+ #else
57
+ typedef float dfloat; // dequantize float
58
+ typedef float2 dfloat2;
59
+ #endif //GGML_CUDA_DMMV_F16
60
+
61
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
49
62
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
50
63
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
64
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
51
65
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
52
66
  typedef void (*ggml_cuda_op_t)(
53
67
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
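
The dfloat/dfloat2 typedefs introduced above let the same dequantize kernels compile with either half-precision or single-precision intermediate math, selected by GGML_CUDA_DMMV_F16. A minimal, self-contained sketch of that switch (scale_pair is an illustrative name, not part of the diff):

```cpp
// Sketch of the compile-time precision switch used by the dequantize kernels.
// Build with -DGGML_CUDA_DMMV_F16 for half2 math, otherwise float2 is used.
#include <cuda_fp16.h>

#ifdef GGML_CUDA_DMMV_F16
typedef half  dfloat;
typedef half2 dfloat2;
#else
typedef float  dfloat;
typedef float2 dfloat2;
#endif

// Scale both lanes of a packed pair by d, matching the pattern in the kernels below.
static __device__ __forceinline__ void scale_pair(dfloat2 & v, const dfloat d) {
#ifdef GGML_CUDA_DMMV_F16
    v = __hmul2(v, {d, d});   // one half2 instruction handles both values
#else
    v.x *= d;
    v.y *= d;
#endif
}
```
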
@@ -151,7 +165,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
151
165
  #define CUDA_ADD_BLOCK_SIZE 256
152
166
  #define CUDA_MUL_BLOCK_SIZE 256
153
167
  #define CUDA_SILU_BLOCK_SIZE 256
168
+ #define CUDA_CPY_BLOCK_SIZE 32
169
+ #define CUDA_SCALE_BLOCK_SIZE 256
154
170
  #define CUDA_ROPE_BLOCK_SIZE 256
171
+ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
155
172
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
156
173
 
157
174
  // dmmv = dequantize_mul_mat_vec
@@ -162,6 +179,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
162
179
  #define GGML_CUDA_DMMV_Y 1
163
180
  #endif
164
181
 
182
+ #ifndef K_QUANTS_PER_ITERATION
183
+ #define K_QUANTS_PER_ITERATION 2
184
+ #else
185
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
186
+ #endif
187
+
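
K_QUANTS_PER_ITERATION is a compile-time tuning knob for the k-quant kernels added further down: via step = 16/K_QUANTS_PER_ITERATION it decides how the 32 lanes of a warp are split between the lower and upper 128 values of a super-block, and how many consecutive quant values each lane handles per inner step. The static_assert rejects anything other than 1 or 2 at build time. A small sketch of the lane-layout arithmetic (lane_layout is an illustrative helper, not part of the diff):

```cpp
// Sketch: how the compile-time iteration count reshapes the per-warp work split.
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#endif
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2,
              "K_QUANTS_PER_ITERATION must be 1 or 2");

static __device__ void lane_layout(int lane /*0..31*/, int & ix, int & im, int & in) {
    const int tid  = lane / K_QUANTS_PER_ITERATION; // 0..31 (K=1) or 0..15 (K=2)
    ix             = lane % K_QUANTS_PER_ITERATION; // starting super-block for this lane
    const int step = 16 / K_QUANTS_PER_ITERATION;   // 16 or 8
    im             = tid / step;                    // lower (0) or upper (1) 128 values
    in             = tid - step*im;                 // position within that half
}
```
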
165
188
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
166
189
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
167
190
 
@@ -219,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
219
242
  }
220
243
  }
221
244
 
222
- static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
245
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
223
246
  const block_q4_0 * x = (const block_q4_0 *) vx;
224
247
 
225
- const float d = x[ib].d;
248
+ const dfloat d = x[ib].d;
226
249
 
227
- const uint8_t vui = x[ib].qs[iqs];
250
+ const int vui = x[ib].qs[iqs];
228
251
 
229
- const int8_t vi0 = vui & 0xF;
230
- const int8_t vi1 = vui >> 4;
252
+ v.x = vui & 0xF;
253
+ v.y = vui >> 4;
231
254
 
232
- v0 = (vi0 - 8)*d;
233
- v1 = (vi1 - 8)*d;
255
+ #ifdef GGML_CUDA_DMMV_F16
256
+ v = __hsub2(v, {8.0f, 8.0f});
257
+ v = __hmul2(v, {d, d});
258
+ #else
259
+ v.x = (v.x - 8.0f) * d;
260
+ v.y = (v.y - 8.0f) * d;
261
+ #endif // GGML_CUDA_DMMV_F16
234
262
  }
235
263
 
236
- static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
264
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
237
265
  const block_q4_1 * x = (const block_q4_1 *) vx;
238
266
 
239
- const float d = x[ib].d;
240
- const float m = x[ib].m;
267
+ const dfloat d = x[ib].d;
268
+ const dfloat m = x[ib].m;
241
269
 
242
- const uint8_t vui = x[ib].qs[iqs];
270
+ const int vui = x[ib].qs[iqs];
243
271
 
244
- const int8_t vi0 = vui & 0xF;
245
- const int8_t vi1 = vui >> 4;
272
+ v.x = vui & 0xF;
273
+ v.y = vui >> 4;
246
274
 
247
- v0 = vi0*d + m;
248
- v1 = vi1*d + m;
275
+ #ifdef GGML_CUDA_DMMV_F16
276
+ v = __hmul2(v, {d, d});
277
+ v = __hadd2(v, {m, m});
278
+ #else
279
+ v.x = (v.x * d) + m;
280
+ v.y = (v.y * d) + m;
281
+ #endif // GGML_CUDA_DMMV_F16
249
282
  }
250
283
 
251
- static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
284
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
252
285
  const block_q5_0 * x = (const block_q5_0 *) vx;
253
286
 
254
- const float d = x[ib].d;
287
+ const dfloat d = x[ib].d;
255
288
 
256
289
  uint32_t qh;
257
290
  memcpy(&qh, x[ib].qh, sizeof(qh));
258
291
 
259
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
260
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
292
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
293
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
261
294
 
262
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
263
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
295
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
296
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
264
297
 
265
- v0 = x0*d;
266
- v1 = x1*d;
298
+ #ifdef GGML_CUDA_DMMV_F16
299
+ v = __hsub2(v, {16.0f, 16.0f});
300
+ v = __hmul2(v, {d, d});
301
+ #else
302
+ v.x = (v.x - 16.0f) * d;
303
+ v.y = (v.y - 16.0f) * d;
304
+ #endif // GGML_CUDA_DMMV_F16
267
305
  }
268
306
 
269
- static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
307
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
270
308
  const block_q5_1 * x = (const block_q5_1 *) vx;
271
309
 
272
- const float d = x[ib].d;
273
- const float m = x[ib].m;
310
+ const dfloat d = x[ib].d;
311
+ const dfloat m = x[ib].m;
274
312
 
275
313
  uint32_t qh;
276
314
  memcpy(&qh, x[ib].qh, sizeof(qh));
277
315
 
278
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
279
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
316
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
317
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
280
318
 
281
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
282
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
319
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
320
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
283
321
 
284
- v0 = x0*d + m;
285
- v1 = x1*d + m;
322
+ #ifdef GGML_CUDA_DMMV_F16
323
+ v = __hmul2(v, {d, d});
324
+ v = __hadd2(v, {m, m});
325
+ #else
326
+ v.x = (v.x * d) + m;
327
+ v.y = (v.y * d) + m;
328
+ #endif // GGML_CUDA_DMMV_F16
286
329
  }
287
330
 
288
- static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
331
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
289
332
  const block_q8_0 * x = (const block_q8_0 *) vx;
290
333
 
291
- const float d = x[ib].d;
334
+ const dfloat d = x[ib].d;
292
335
 
293
- const int8_t vi0 = x[ib].qs[iqs + 0];
294
- const int8_t vi1 = x[ib].qs[iqs + 1];
336
+ v.x = x[ib].qs[iqs + 0];
337
+ v.y = x[ib].qs[iqs + 1];
295
338
 
296
- v0 = vi0*d;
297
- v1 = vi1*d;
339
+ #ifdef GGML_CUDA_DMMV_F16
340
+ v = __hmul2(v, {d, d});
341
+ #else
342
+ v.x *= d;
343
+ v.y *= d;
344
+ #endif // GGML_CUDA_DMMV_F16
298
345
  }
299
346
 
300
347
  //================================== k-quants
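
For reference, the Q4_0 layout that dequantize_q4_0 above unpacks stores 32 weights per block as one scale d plus 16 bytes of packed 4-bit codes, with the low nibbles holding the first 16 weights and the high nibbles the last 16; a code q in 0..15 decodes to (q - 8)*d, so for example code 11 with d = 0.05 gives 0.15. A plain host-side sketch of the same math (the struct here is illustrative; the real block_q4_0 is defined in ggml and stores d as fp16):

```cpp
#include <cstdint>

// Host-side reference for the Q4_0 decode performed by dequantize_q4_0.
struct q4_0_block_ref {
    float   d;       // scale (fp16 in the real format)
    uint8_t qs[16];  // 32 x 4-bit codes, two per byte
};

static void dequantize_q4_0_ref(const q4_0_block_ref & b, float out[32]) {
    for (int i = 0; i < 16; ++i) {
        const int lo = b.qs[i] & 0xF;    // weights 0..15 come from the low nibbles
        const int hi = b.qs[i] >> 4;     // weights 16..31 come from the high nibbles
        out[i]      = (lo - 8) * b.d;
        out[i + 16] = (hi - 8) * b.d;
    }
}
```
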
@@ -321,37 +368,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
321
368
 
322
369
  }
323
370
 
324
- static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
325
-
326
- const block_q2_K * x = (const block_q2_K *) vx;
327
-
328
- // if n is 0, we want to do the lower 128, else the upper 128,
329
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
330
- // y[l+16], y[l+48], y[l+80], y[l+112]
331
- int n = iqs/128; // 0 or 1
332
- int r = iqs - 128*n; // 0...120 in steps of 8
333
- int l = r/8; // 0...15 in steps of 1
334
-
335
- const float * y = yy + 128*n + l;
336
- const uint8_t * q = x[ib].qs + 32*n + l;
337
- const uint8_t * s = x[ib].scales + 8*n;
338
-
339
- const float dall = x[ib].d;
340
- const float dmin = x[ib].dmin;
341
-
342
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
343
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
344
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
345
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
346
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
347
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
348
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
349
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
350
-
351
- result = sum;
352
-
353
- }
354
-
355
371
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
356
372
 
357
373
  int r = threadIdx.x/4;
@@ -383,51 +399,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
383
399
 
384
400
  }
385
401
 
386
- static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
387
-
388
- const block_q3_K * x = (const block_q3_K *) vx;
389
-
390
- const uint32_t kmask1 = 0x03030303;
391
- const uint32_t kmask2 = 0x0f0f0f0f;
392
-
393
- uint32_t aux[3];
394
- uint32_t utmp[4];
395
-
396
- // if n is 0, we want to do the lower 128, else the upper 128,
397
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
398
- // y[l+16], y[l+48], y[l+80], y[l+112]
399
- int n = iqs/128; // 0 or 1
400
- int r = iqs - 128*n; // 0...120 in steps of 8
401
- int l = r/8; // 0...15 in steps of 1
402
-
403
- const float * y = yy + 128*n + l;
404
- const uint8_t * q = x[ib].qs + 32*n + l;
405
- const uint8_t * hm = x[ib].hmask + l;
406
- const int8_t * s = (const int8_t *)utmp + 8*n;
407
-
408
- memcpy(aux, x[ib].scales, 12);
409
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
410
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
411
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
412
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
413
-
414
- const float dall = x[ib].d;
415
-
416
- const uint8_t m = 1 << (4*n);
417
-
418
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
419
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
420
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
421
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
422
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
423
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
424
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
425
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
426
-
427
- result = sum * dall;
428
-
429
- }
430
-
431
402
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
432
403
  if (j < 4) {
433
404
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -474,38 +445,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
474
445
  }
475
446
  }
476
447
 
477
- static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
478
-
479
- const block_q4_K * x = (const block_q4_K *) vx;
480
-
481
- // iqs is in 0...248 in steps of 8 =>
482
- const int j = iqs / 64; // j is in 0...3
483
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
484
- const int is = 2*j; // is is in 0...6 in steps of 2
485
-
486
- const float * y = yy + 64*j + ir;
487
- const uint8_t * q = x[ib].qs + 32*j + ir;
488
-
489
- const float dall = x[ib].d;
490
- const float dmin = x[ib].dmin;
491
-
492
- uint8_t sc, m;
493
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
494
- const float d1 = dall * sc;
495
- const float m1 = dmin * m;
496
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
497
- const float d2 = dall * sc;
498
- const float m2 = dmin * m;
499
-
500
- float sum = 0;
501
- for (int k = 0; k < 4; ++k) {
502
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
503
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
504
- }
505
- result = sum;
506
-
507
- }
508
-
509
448
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
510
449
  const block_q5_K * x = (const block_q5_K *) vx;
511
450
 
@@ -539,43 +478,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
539
478
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
540
479
  }
541
480
 
542
- static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
543
-
544
- const block_q5_K * x = (const block_q5_K *) vx;
545
-
546
- // iqs is in 0...248 in steps of 8 =>
547
- const int j = iqs / 64; // j is in 0...3
548
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
549
- const int is = 2*j; // is is in 0...6 in steps of 2
550
-
551
- const float * y = yy + 64*j + ir;
552
- const uint8_t * ql = x[ib].qs + 32*j + ir;
553
- const uint8_t * qh = x[ib].qh + ir;
554
-
555
- const float dall = x[ib].d;
556
- const float dmin = x[ib].dmin;
557
-
558
- uint8_t sc, m;
559
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
560
- const float d1 = dall * sc;
561
- const float m1 = dmin * m;
562
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
563
- const float d2 = dall * sc;
564
- const float m2 = dmin * m;
565
-
566
- uint8_t hm = 1 << is;
567
- float sum = 0;
568
- for (int k = 0; k < 4; ++k) {
569
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
570
- }
571
- hm <<= 1;
572
- for (int k = 0; k < 4; ++k) {
573
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
574
- }
575
- result = sum;
576
-
577
- }
578
-
579
481
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
580
482
  const block_q6_K * x = (const block_q6_K *) vx;
581
483
 
@@ -601,38 +503,395 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
601
503
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
602
504
  }
603
505
 
604
- static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
506
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
605
507
 
606
- const block_q6_K * x = (const block_q6_K *) vx;
508
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
509
+
510
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
511
+ if (row > nrows) return;
512
+
513
+ const int num_blocks_per_row = ncols / QK_K;
514
+ const int ib0 = row*num_blocks_per_row;
515
+
516
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
517
+
518
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
519
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
520
+
521
+ const int step = 16/K_QUANTS_PER_ITERATION;
522
+
523
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
524
+ const int in = tid - step*im; // 0...15 or 0...7
525
+
526
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
527
+ const int q_offset = 32*im + l0;
528
+ const int s_offset = 8*im;
529
+ const int y_offset = 128*im + l0;
530
+
531
+ float tmp = 0; // partial sum for thread in warp
532
+
533
+ uint32_t aux[4];
534
+ const uint8_t * d = (const uint8_t *)aux;
535
+ const uint8_t * m = (const uint8_t *)(aux + 2);
536
+
537
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
538
+
539
+ const float * y = yy + i * QK_K + y_offset;
540
+ const uint8_t * q = x[i].qs + q_offset;
541
+
542
+ const float dall = x[i].d;
543
+ const float dmin = x[i].dmin;
544
+
545
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
546
+ aux[0] = a[0] & 0x0f0f0f0f;
547
+ aux[1] = a[1] & 0x0f0f0f0f;
548
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
549
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
550
+
551
+ float sum1 = 0, sum2 = 0;
552
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
553
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
554
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
555
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
556
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
557
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
558
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
559
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
560
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
561
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
562
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
563
+
564
+ }
565
+ tmp += dall * sum1 - dmin * sum2;
566
+
567
+ }
568
+
569
+ // sum up partial sums and write back result
570
+ __syncthreads();
571
+ #pragma unroll
572
+ for (int mask = 16; mask > 0; mask >>= 1) {
573
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
574
+ }
575
+
576
+ if (tid == 0) {
577
+ dst[row] = tmp;
578
+ }
579
+ }
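
Each of the new dequantize_mul_mat_vec_q*_k kernels finishes with the same warp-level butterfly reduction: every lane repeatedly adds the partial sum of the lane whose index differs in one bit, halving the distance each step, so after five steps all 32 lanes hold the row total and a single lane writes it out. A standalone sketch of the pattern (warp_reduce_sum is an illustrative name):

```cpp
// Butterfly (XOR) reduction across one warp, as used at the end of the
// dequantize_mul_mat_vec_q*_k kernels.
static __device__ float warp_reduce_sum(float v) {
    #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v; // every lane now holds the same total
}
```
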
580
+
581
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
582
+
583
+ const uint16_t kmask1 = 0x0303;
584
+ const uint16_t kmask2 = 0x0f0f;
585
+
586
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
587
+ if (row > nrows) return;
588
+
589
+ const int num_blocks_per_row = ncols / QK_K;
590
+ const int ib0 = row*num_blocks_per_row;
591
+
592
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
593
+
594
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
595
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
596
+
597
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
598
+ const int step = 16/K_QUANTS_PER_ITERATION;
599
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
600
+ const int in = tid - step*im; // 0....15 or 0...7
601
+
602
+ const uint8_t m = 1 << (4*im);
603
+
604
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
605
+ const int q_offset = 32*im + l0;
606
+ const int y_offset = 128*im + l0;
607
+
608
+ uint16_t utmp[4];
609
+ const int8_t * s = (const int8_t *)utmp;
610
+
611
+ const uint16_t s_shift = 4*im;
612
+
613
+ float tmp = 0; // partial sum for thread in warp
614
+
615
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
+
617
+ const float * y = yy + i * QK_K + y_offset;
618
+ const uint8_t * q = x[i].qs + q_offset;
619
+ const uint8_t * h = x[i].hmask + l0;
620
+
621
+ const uint16_t * a = (const uint16_t *)x[i].scales;
622
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
623
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
624
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
625
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
626
+
627
+ const float d = x[i].d;
628
+
629
+ float sum = 0;
630
+ for (int l = 0; l < n; ++l) {
631
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
632
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
633
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
634
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
635
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
636
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
637
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
638
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
639
+ }
640
+ tmp += d * sum;
641
+
642
+ }
643
+
644
+ // sum up partial sums and write back result
645
+ __syncthreads();
646
+ #pragma unroll
647
+ for (int mask = 16; mask > 0; mask >>= 1) {
648
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
649
+ }
650
+
651
+ if (tid == 0) {
652
+ dst[row] = tmp;
653
+ }
654
+ }
655
+
656
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
657
+
658
+ const uint16_t kmask1 = 0x3f3f;
659
+ const uint16_t kmask2 = 0x0f0f;
660
+ const uint16_t kmask3 = 0xc0c0;
661
+
662
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
663
+ if (row > nrows) return;
664
+ const int num_blocks_per_row = ncols / QK_K;
665
+ const int ib0 = row*num_blocks_per_row;
666
+
667
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
668
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
669
+
670
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
671
+
672
+ const int il = tid/step; // 0...3
673
+ const int ir = tid - step*il; // 0...7 or 0...3
674
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
675
+
676
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
677
+ const int in = il%2;
678
+
679
+ const int l0 = n*(2*ir + in);
680
+ const int q_offset = 32*im + l0;
681
+ const int y_offset = 64*im + l0;
682
+
683
+ uint16_t aux[4];
684
+ const uint8_t * sc = (const uint8_t *)aux;
685
+
686
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
687
+
688
+ float tmp = 0; // partial sum for thread in warp
689
+
690
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
691
+
692
+ const uint8_t * q1 = x[i].qs + q_offset;
693
+ const uint8_t * q2 = q1 + 64;
694
+ const float * y1 = yy + i*QK_K + y_offset;
695
+ const float * y2 = y1 + 128;
696
+
697
+ const float dall = x[i].d;
698
+ const float dmin = x[i].dmin;
699
+
700
+ const uint16_t * a = (const uint16_t *)x[i].scales;
701
+ aux[0] = a[im+0] & kmask1;
702
+ aux[1] = a[im+2] & kmask1;
703
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
704
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
705
+
706
+ float4 s = {0.f, 0.f, 0.f, 0.f};
707
+ float smin = 0;
708
+ for (int l = 0; l < n; ++l) {
709
+ s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
710
+ s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
711
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
712
+ }
713
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
714
+
715
+ }
716
+
717
+ // sum up partial sums and write back result
718
+ __syncthreads();
719
+ #pragma unroll
720
+ for (int mask = 16; mask > 0; mask >>= 1) {
721
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
722
+ }
723
+
724
+ if (tid == 0) {
725
+ dst[row] = tmp;
726
+ }
727
+ }
728
+
729
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
730
+
731
+ const uint16_t kmask1 = 0x3f3f;
732
+ const uint16_t kmask2 = 0x0f0f;
733
+ const uint16_t kmask3 = 0xc0c0;
734
+
735
+ //const int row = blockIdx.x*blockDim.y + threadIdx.y;
736
+ const int row = blockIdx.x;
737
+ const int num_blocks_per_row = ncols / QK_K;
738
+ const int ib0 = row*num_blocks_per_row;
739
+
740
+ const int tid = threadIdx.x/2; // 0...15
741
+ const int ix = threadIdx.x%2;
742
+
743
+ const int il = tid/4; // 0...3
744
+ const int ir = tid - 4*il;// 0...3
745
+ const int n = 2;
607
746
 
608
- const int ip = iqs / 128; // 0 or 1
609
- const int il = (iqs - 128*ip)/8; // 0...15
610
- const int is = 8*ip;
747
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
748
+ const int in = il%2;
611
749
 
612
- const float * y = yy + 128*ip + il;
750
+ const int l0 = n*(2*ir + in);
751
+ const int q_offset = 32*im + l0;
752
+ const int y_offset = 64*im + l0;
613
753
 
614
- const float d = x[ib].d;
754
+ const uint8_t hm1 = 1 << (2*im);
755
+ const uint8_t hm2 = hm1 << 4;
615
756
 
616
- const uint8_t * ql = x[ib].ql + 64*ip + il;
617
- const uint8_t * qh = x[ib].qh + 32*ip + il;
618
- const int8_t * sc = x[ib].scales + is;
757
+ uint16_t aux[4];
758
+ const uint8_t * sc = (const uint8_t *)aux;
619
759
 
620
- result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
621
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
622
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
623
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
624
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
625
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
626
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
627
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
760
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
628
761
 
762
+ float tmp = 0; // partial sum for thread in warp
763
+
764
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
765
+
766
+ const uint8_t * ql1 = x[i].qs + q_offset;
767
+ const uint8_t * ql2 = ql1 + 64;
768
+ const uint8_t * qh = x[i].qh + l0;
769
+ const float * y1 = yy + i*QK_K + y_offset;
770
+ const float * y2 = y1 + 128;
771
+
772
+ const float dall = x[i].d;
773
+ const float dmin = x[i].dmin;
774
+
775
+ const uint16_t * a = (const uint16_t *)x[i].scales;
776
+ aux[0] = a[im+0] & kmask1;
777
+ aux[1] = a[im+2] & kmask1;
778
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
779
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
780
+
781
+ float4 sum = {0.f, 0.f, 0.f, 0.f};
782
+ float smin = 0;
783
+ for (int l = 0; l < n; ++l) {
784
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
785
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
786
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
787
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
788
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
789
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
790
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
791
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
792
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
793
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
794
+ }
795
+ tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
796
+
797
+ }
798
+
799
+ // sum up partial sums and write back result
800
+ __syncthreads();
801
+ #pragma unroll
802
+ for (int mask = 16; mask > 0; mask >>= 1) {
803
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
804
+ }
805
+
806
+ if (tid == 0) {
807
+ dst[row] = tmp;
808
+ }
629
809
  }
630
810
 
631
- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
811
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
812
+
813
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
814
+
815
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
816
+ if (row > nrows) return;
817
+
818
+ const int num_blocks_per_row = ncols / QK_K;
819
+ const int ib0 = row*num_blocks_per_row;
820
+
821
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
822
+
823
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
824
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
825
+
826
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
827
+
828
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
829
+ const int in = tid - step*im; // 0...15 or 0...7
830
+
831
+ #if K_QUANTS_PER_ITERATION == 1
832
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
833
+ const int is = 0;
834
+ #else
835
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
836
+ const int is = in / 4;
837
+ #endif
838
+ const int ql_offset = 64*im + l0;
839
+ const int qh_offset = 32*im + l0;
840
+ const int s_offset = 8*im + is;
841
+ const int y_offset = 128*im + l0;
842
+
843
+ float tmp = 0; // partial sum for thread in warp
844
+
845
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
846
+
847
+ const float * y = yy + i * QK_K + y_offset;
848
+ const uint8_t * ql = x[i].ql + ql_offset;
849
+ const uint8_t * qh = x[i].qh + qh_offset;
850
+ const int8_t * s = x[i].scales + s_offset;
851
+
852
+ const float d = x[i].d;
853
+
854
+ #if K_QUANTS_PER_ITERATION == 1
855
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
856
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
857
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
858
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
859
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
860
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
861
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
862
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
863
+ tmp += sum;
864
+ #else
865
+ float sum = 0;
866
+ for (int l = 0; l < 4; ++l) {
867
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
868
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
869
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
870
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
871
+ }
872
+ tmp += sum;
873
+ #endif
874
+
875
+ }
876
+
877
+ // sum up partial sums and write back result
878
+ __syncthreads();
879
+ #pragma unroll
880
+ for (int mask = 16; mask > 0; mask >>= 1) {
881
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
882
+ }
883
+
884
+ if (tid == 0) {
885
+ dst[row] = tmp;
886
+ }
887
+ }
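
In the Q6_K kernel above each weight is rebuilt from a 4-bit low part in ql and a 2-bit high part in qh, recentred by subtracting 32 and then scaled by the per-group scale and the block scale d. A host-side sketch of the reconstruction of a single value (illustrative helper, not the full block traversal):

```cpp
#include <cstdint>

// Reference for one Q6_K value as reconstructed above: 4 low bits | 2 high bits,
// centred around zero, then scaled by the group scale and the block scale.
static inline float dequant_q6_val_ref(uint8_t ql_nibble /*0..15*/,
                                       uint8_t qh_2bits  /*0..3*/,
                                       int8_t  group_scale,
                                       float   d) {
    const int q = (int8_t)(ql_nibble | (qh_2bits << 4)) - 32; // signed value in -32..31
    return d * group_scale * q;
}
```
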
888
+
889
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
632
890
  const half * x = (const half *) vx;
633
891
 
634
- v0 = __half2float(x[ib + iqs + 0]);
635
- v1 = __half2float(x[ib + iqs + 1]);
892
+ // automatic half -> float type cast if dfloat == float
893
+ v.x = x[ib + iqs + 0];
894
+ v.y = x[ib + iqs + 1];
636
895
  }
637
896
 
638
897
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -649,23 +908,35 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
649
908
  const int y_offset = qr == 1 ? 1 : qk/2;
650
909
 
651
910
  // dequantize
652
- float & v0 = y[iybs + iqs + 0];
653
- float & v1 = y[iybs + iqs + y_offset];
654
- dequantize_kernel(vx, ib, iqs, v0, v1);
911
+ dfloat2 v;
912
+ dequantize_kernel(vx, ib, iqs, v);
913
+
914
+ y[iybs + iqs + 0] = v.x;
915
+ y[iybs + iqs + y_offset] = v.y;
655
916
  }
656
917
 
657
918
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
658
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
919
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
659
920
  // qk = quantized weights per x block
660
921
  // qr = number of quantized weights per data value in x block
661
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
922
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
923
+
924
+ if (row >= nrows) {
925
+ return;
926
+ }
927
+
662
928
  const int tid = threadIdx.x;
663
929
 
664
930
  const int iter_stride = 2*GGML_CUDA_DMMV_X;
665
931
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
666
932
  const int y_offset = qr == 1 ? 1 : qk/2;
667
933
 
668
- float tmp = 0.0f; // partial sum for thread in warp
934
+ // partial sum for each thread
935
+ #ifdef GGML_CUDA_DMMV_F16
936
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
937
+ #else
938
+ float tmp = 0.0f;
939
+ #endif // GGML_CUDA_DMMV_F16
669
940
 
670
941
  for (int i = 0; i < ncols; i += iter_stride) {
671
942
  const int col = i + vals_per_iter*tid;
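
dequantize_mul_mat_vec now receives nrows and bails out when a thread's row falls past the end. That guard is what lets the launch helpers further down size the grid with a ceiling division instead of asserting that nrows is divisible by GGML_CUDA_DMMV_Y. A sketch of the pairing (illustrative names):

```cpp
// Rounded-up grid plus in-kernel bounds guard, the scheme used for the
// dequantize_mul_mat_vec launches below.
__global__ void per_row_kernel(float * dst, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row >= nrows) {
        return;               // surplus rows of the last block do nothing
    }
    dst[row] = 0.0f;          // ...real per-row work goes here
}

static void launch_per_row(float * dst, int nrows, int rows_per_block, cudaStream_t stream) {
    const int block_num_y = (nrows + rows_per_block - 1) / rows_per_block; // ceiling division
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(32, rows_per_block, 1);
    per_row_kernel<<<block_nums, block_dims, 0, stream>>>(dst, nrows);
}
```
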
@@ -679,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
679
950
  // process 2 vals per j iter
680
951
 
681
952
  // dequantize
682
- float v0, v1;
683
- dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
684
953
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
954
+ dfloat2 v;
955
+ dequantize_kernel(vx, ib, iqs + j/qr, v);
685
956
 
686
957
  // matrix multiplication
687
- tmp += v0 * y[iybs + iqs + j/qr + 0];
688
- tmp += v1 * y[iybs + iqs + j/qr + y_offset];
689
958
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
959
+ #ifdef GGML_CUDA_DMMV_F16
960
+ tmp += __hmul2(v, {
961
+ y[iybs + iqs + j/qr + 0],
962
+ y[iybs + iqs + j/qr + y_offset]
963
+ });
964
+ #else
965
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
966
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
967
+ #endif // GGML_CUDA_DMMV_F16
690
968
  }
691
969
  }
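
With GGML_CUDA_DMMV_F16 the accumulator above is a half2, so each __hmul2 step in the loop multiplies two products at once, and the two lanes are only collapsed into a single float when the result is written out (tmp.x + tmp.y below). A compact sketch of that accumulate-then-collapse pattern (dot2_f16 is an illustrative name):

```cpp
#include <cuda_fp16.h>

// Two partial dot products ride in one half2 register and are merged at the end,
// mirroring the GGML_CUDA_DMMV_F16 path of dequantize_mul_mat_vec.
static __device__ float dot2_f16(const half2 * a, const half2 * b, const int n2) {
    half2 acc = __float2half2_rn(0.0f);
    for (int i = 0; i < n2; ++i) {
        acc = __hfma2(a[i], b[i], acc);  // acc.x += a.x*b.x; acc.y += a.y*b.y
    }
    return __low2float(acc) + __high2float(acc); // same as tmp.x + tmp.y
}
```
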
692
970
 
@@ -698,64 +976,232 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
698
976
  }
699
977
 
700
978
  if (tid == 0) {
979
+ #ifdef GGML_CUDA_DMMV_F16
980
+ dst[row] = tmp.x + tmp.y;
981
+ #else
701
982
  dst[row] = tmp;
983
+ #endif // GGML_CUDA_DMMV_F16
702
984
  }
703
985
  }
704
986
 
705
- template <int n_thread, dot_kernel_k_t dot_kernel>
706
- static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) {
707
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
708
- const int tid = threadIdx.x;
987
+ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
988
+ const half * x = (half *) vx;
709
989
 
710
- const int iter_stride = QK_K;
711
- const int vals_per_iter = iter_stride / n_thread;
712
- const int num_blocks_per_row = ncols / QK_K;
713
- const int ib0 = row*num_blocks_per_row;
990
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
991
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
714
992
 
715
- float tmp = 0; // partial sum for thread in warp
993
+ const int nrows_y = ncols_x;
994
+ const int nrows_dst = nrows_x;
995
+ const int row_dst = row_x;
716
996
 
717
- for (int i = 0; i < ncols; i += iter_stride) {
718
- const int col = i + vals_per_iter*tid;
719
- const int ib = ib0 + col/QK_K; // x block index
720
- const int iqs = col%QK_K; // x quant index
721
- const int iybs = col - col%QK_K; // y block start index
997
+ float tmp = 0.0f;
998
+
999
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1000
+ const int col_x = col_x0 + threadIdx.x;
1001
+
1002
+ if (col_x >= ncols_x) {
1003
+ break;
1004
+ }
1005
+
1006
+ // x is transposed and permuted
1007
+ const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
1008
+ const float xi = __half2float(x[ix]);
1009
+
1010
+ const int row_y = col_x;
1011
+
1012
+
1013
+ // y is not transposed but permuted
1014
+ const int iy = channel*nrows_y + row_y;
1015
+
1016
+ tmp += xi * y[iy];
1017
+ }
1018
+
1019
+ // dst is not transposed and not permuted
1020
+ const int idst = channel*nrows_dst + row_dst;
1021
+
1022
+ // sum up partial sums and write back result
1023
+ __syncthreads();
1024
+ #pragma unroll
1025
+ for (int mask = 16; mask > 0; mask >>= 1) {
1026
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1027
+ }
1028
+
1029
+ if (threadIdx.x == 0) {
1030
+ dst[idst] = tmp;
1031
+ }
1032
+ }
1033
+
1034
+ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1035
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1036
+ const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
1037
+
1038
+ const half * x = (half *) vx;
1039
+
1040
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1041
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
1042
+
1043
+ const int nrows_y = ncols_x;
1044
+ const int nrows_dst = nrows_x;
1045
+ const int row_dst = row_x;
1046
+
1047
+ const int idst = channel*nrows_dst + row_dst;
1048
+
1049
+ float tmp = 0.0f;
1050
+
1051
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
1052
+ const int col_x = col_x0 + threadIdx.x;
1053
+
1054
+ if (col_x >= ncols_x) {
1055
+ break;
1056
+ }
1057
+
1058
+ const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
1059
+ const float xi = __half2float(x[ix]);
1060
+
1061
+ const int row_y = col_x;
1062
+
1063
+ const int iy = channel*nrows_y + row_y;
1064
+
1065
+ tmp += xi * y[iy];
1066
+ }
1067
+
1068
+ // sum up partial sums and write back result
1069
+ __syncthreads();
1070
+ #pragma unroll
1071
+ for (int mask = 16; mask > 0; mask >>= 1) {
1072
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1073
+ }
1074
+
1075
+ if (threadIdx.x == 0) {
1076
+ dst[idst] = tmp;
1077
+ }
1078
+ }
1079
+
1080
+ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
1081
+ const float * xi = (float *) cxi;
1082
+ float * dsti = (float *) cdsti;
1083
+
1084
+ *dsti = *xi;
1085
+ }
1086
+
1087
+ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
1088
+ const float * xi = (float *) cxi;
1089
+ half * dsti = (half *) cdsti;
1090
+
1091
+ *dsti = __float2half(*xi);
1092
+ }
1093
+
1094
+ template <cpy_kernel_t cpy_1>
1095
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
1096
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
1097
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
1098
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
1099
+
1100
+ if (i >= ne) {
1101
+ return;
1102
+ }
1103
+
1104
+ // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
1105
+ // then combine those indices with the corresponding byte offsets to get the total offsets
1106
+ const int i02 = i / (ne00*ne01);
1107
+ const int i01 = (i - i02*ne01*ne00) / ne00;
1108
+ const int i00 = i - i02*ne01*ne00 - i01*ne00;
1109
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
1110
+
1111
+ const int i12 = i / (ne10*ne11);
1112
+ const int i11 = (i - i12*ne10*ne11) / ne10;
1113
+ const int i10 = i - i12*ne10*ne11 - i11*ne10;
1114
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
1115
+
1116
+ cpy_1(cx + x_offset, cdst + dst_offset);
1117
+ }
1118
+
1119
+ // rope == RoPE == rotary positional embedding
1120
+ static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
1121
+ const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
1122
+
1123
+ if (col >= ncols) {
1124
+ return;
1125
+ }
1126
+
1127
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1128
+ const int i = row*ncols + col;
1129
+
1130
+ const float theta = p*powf(theta_scale, col/2);
1131
+ const float sin_theta = sinf(theta);
1132
+ const float cos_theta = cosf(theta);
1133
+
1134
+ const float x0 = x[i + 0];
1135
+ const float x1 = x[i + 1];
1136
+
1137
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
1138
+ dst[i + 1] = x0*sin_theta + x1*cos_theta;
1139
+ }
1140
+
1141
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
1142
+ const int col = blockDim.x*blockIdx.x + threadIdx.x;
1143
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1144
+
1145
+ if (col >= ncols) {
1146
+ return;
1147
+ }
1148
+
1149
+ const int i = row*ncols + col;
1150
+ // dst[i] = col > n_past + row ? -INFINITY : x[i];
1151
+ dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
1152
+ }
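
Rather than branching to write -INFINITY, diag_mask_inf_f32 multiplies the mask condition (0 or 1) by INT_MAX and subtracts it, so masked positions become hugely negative and vanish in the following softmax while unmasked values pass through unchanged; the commented-out line shows the branchy equivalent. Note the precedence: the comparison is against n_past + (row % rows_per_channel). A tiny reference (diag_mask_ref is an illustrative name):

```cpp
#include <climits>

// Branchless masking as in diag_mask_inf_f32 above.
static inline float diag_mask_ref(float x, int col, int row_in_channel, int n_past) {
    const bool masked = col > n_past + row_in_channel;
    return x - masked * (float)INT_MAX; // unchanged when not masked, ~-2.1e9 when masked
}
```
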
1153
+
1154
+ // the CUDA soft max implementation differs from the CPU implementation
1155
+ // instead of doubles floats are used
1156
+ // values are also not normalized to the maximum value by subtracting it in the exponential function
1157
+ // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
1158
+ static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
1159
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1160
+ const int block_size = blockDim.x;
1161
+ const int tid = threadIdx.x;
1162
+
1163
+ float tmp = 0.0;
1164
+
1165
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
1166
+ const int col = block_start + tid;
722
1167
 
723
- float v;
724
- dot_kernel(vx, ib, iqs, y + iybs, v);
725
- tmp += v;
1168
+ if (col >= ncols) {
1169
+ break;
1170
+ }
1171
+
1172
+ const int i = row*ncols + col;
1173
+ const float val = expf(x[i]);
1174
+ tmp += val;
1175
+ dst[i] = val;
726
1176
  }
727
1177
 
728
- // sum up partial sums and write back result
1178
+ // sum up partial sums
729
1179
  __syncthreads();
730
1180
  #pragma unroll
731
1181
  for (int mask = 16; mask > 0; mask >>= 1) {
732
1182
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
733
1183
  }
734
1184
 
735
- if (tid == 0) {
736
- dst[row] = tmp;
1185
+ for (int block_start = 0; block_start < ncols; block_start += block_size) {
1186
+ const int col = block_start + tid;
1187
+
1188
+ if (col >= ncols) {
1189
+ break;
1190
+ }
1191
+
1192
+ const int i = row*ncols + col;
1193
+ dst[i] /= tmp;
737
1194
  }
738
1195
  }
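
The comment above names the two deviations from the CPU softmax: float rather than double accumulation, and no subtraction of the row maximum before exponentiating. Per row the kernel therefore makes two passes, first storing expf(x[i]) while warp-reducing the sum, then dividing every stored value by that sum. A serial reference for one row (soft_max_row_ref is an illustrative name):

```cpp
#include <cmath>

// Serial reference for the per-row work of soft_max_f32 above.
static void soft_max_row_ref(const float * x, float * dst, int ncols) {
    float sum = 0.0f;
    for (int col = 0; col < ncols; ++col) { // pass 1: exponentiate and accumulate
        dst[col] = expf(x[col]);
        sum += dst[col];
    }
    for (int col = 0; col < ncols; ++col) { // pass 2: normalize
        dst[col] /= sum;
    }
}
```
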
739
1196
 
740
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
741
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
1197
+ static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
1198
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
742
1199
 
743
- if (col >= ncols) {
1200
+ if (i >= k) {
744
1201
  return;
745
1202
  }
746
1203
 
747
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
748
- const int i = row*ncols + col;
749
-
750
- const float theta = p*powf(theta_scale, col/2);
751
- const float sin_theta = sinf(theta);
752
- const float cos_theta = cosf(theta);
753
-
754
- const float x0 = x[i + 0];
755
- const float x1 = x[i + 1];
756
-
757
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
758
- dst[i + 1] = x0*sin_theta + x1*cos_theta;
1204
+ dst[i] = scale * x[i];
759
1205
  }
760
1206
 
761
1207
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
@@ -829,75 +1275,91 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
829
1275
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
830
1276
  }
831
1277
 
832
- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1278
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
833
1279
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
834
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1280
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1281
+ const dim3 block_nums(1, block_num_y, 1);
835
1282
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
836
1283
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
837
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1284
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
838
1285
  }
839
1286
 
840
- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1287
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
841
1288
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
842
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1289
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1290
+ const dim3 block_nums(1, block_num_y, 1);
843
1291
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
844
1292
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
845
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1293
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
846
1294
  }
847
1295
 
848
- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1296
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
849
1297
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
850
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1298
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1299
+ const dim3 block_nums(1, block_num_y, 1);
851
1300
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
852
1301
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
853
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1302
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
854
1303
  }
855
1304
 
856
- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1305
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
857
1306
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
858
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1307
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1308
+ const dim3 block_nums(1, block_num_y, 1);
859
1309
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
860
1310
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
861
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1311
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
862
1312
  }
863
1313
 
864
- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1314
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
865
1315
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
866
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1316
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1317
+ const dim3 block_nums(1, block_num_y, 1);
867
1318
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
868
1319
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
869
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1320
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
870
1321
  }
871
1322
 
872
1323
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
873
1324
  GGML_ASSERT(ncols % QK_K == 0);
874
- const int ny = 2;
1325
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1326
+ const int block_num_y = (nrows + ny - 1) / ny;
1327
+ const dim3 block_nums(1, block_num_y, 1);
875
1328
  const dim3 block_dims(32, ny, 1);
876
- dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
1329
+ dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
877
1330
  }
878
1331
 
879
1332
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
880
1333
  GGML_ASSERT(ncols % QK_K == 0);
881
- const dim3 block_dims(32, 2, 1);
882
- dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
1334
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1335
+ const int block_num_y = (nrows + ny - 1) / ny;
1336
+ const dim3 block_nums(1, block_num_y, 1);
1337
+ const dim3 block_dims(32, ny, 1);
1338
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
883
1339
  }
884
1340
 
885
1341
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
886
1342
  GGML_ASSERT(ncols % QK_K == 0);
887
- const dim3 block_dims(32, 2, 1);
888
- dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
1343
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1344
+ const int block_num_y = (nrows + ny - 1) / ny;
1345
+ const dim3 block_nums(1, block_num_y, 1);
1346
+ const dim3 block_dims(32, ny, 1);
1347
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
889
1348
  }
890
1349
 
891
1350
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
892
1351
  GGML_ASSERT(ncols % QK_K == 0);
893
- const dim3 block_dims(32, 2, 1);
894
- dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
1352
+ const dim3 block_dims(32, 1, 1);
1353
+ dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
895
1354
  }
896
1355
 
897
1356
  static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
898
1357
  GGML_ASSERT(ncols % QK_K == 0);
899
- const dim3 block_dims(32, 2, 1);
900
- dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
1358
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1359
+ const int block_num_y = (nrows + ny - 1) / ny;
1360
+ const dim3 block_nums(1, block_num_y, 1);
1361
+ const dim3 block_dims(32, ny, 1);
1362
+ dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
901
1363
  }
902
1364
 
903
1365
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -905,12 +1367,13 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
905
1367
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
906
1368
  }
907
1369
 
908
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1370
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
909
1371
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
910
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1372
+ const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1373
+ const dim3 block_nums(1, block_num_y, 1);
911
1374
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
912
1375
  dequantize_mul_mat_vec<1, 1, convert_f16>
913
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
1376
+ <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
914
1377
  }
915
1378
 
916
1379
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1405,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
942
1405
  }
943
1406
  }
944
1407
 
1408
+ static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
1409
+ const dim3 block_nums(1, nrows_x, nchannels_x);
1410
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1411
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
1412
+ }
1413
+
1414
+ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1415
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
1416
+ const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
1417
+
1418
+ const dim3 block_nums(1, nrows_x, nchannels_x);
1419
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1420
+ mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1421
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
1422
+ }
1423
+
1424
+ static void ggml_cpy_f32_f32_cuda(
1425
+ const char * cx, char * cdst, const int ne,
1426
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
1427
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
1428
+
1429
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
1430
+ cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
1431
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
1432
+ }
1433
+
1434
+ static void ggml_cpy_f32_f16_cuda(
1435
+ const char * cx, char * cdst, const int ne,
1436
+ const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
1437
+ const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
1438
+
1439
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
1440
+ cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
1441
+ (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
1442
+ }
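
Both copy wrappers above instantiate the same cpy_f32_f16 kernel and only swap the per-element functor (cpy_1_f32_f32 or cpy_1_f32_f16), so the non-contiguous index arithmetic is written once. A minimal sketch of that pattern with illustrative names:

```cpp
// Kernel templated on a per-element device function, as with cpy_f32_f16<cpy_1_...>.
typedef void (*elem_op_t)(const char * src, char * dst);

static __device__ void copy_f32(const char * src, char * dst) {
    *(float *) dst = *(const float *) src;
}

template <elem_op_t op>
static __global__ void for_each_elem(const char * src, char * dst, const int ne,
                                     const int src_stride, const int dst_stride) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    op(src + i*src_stride, dst + i*dst_stride); // the functor does the element conversion
}

// usage: for_each_elem<copy_f32><<<num_blocks, block_size, 0, stream>>>(...);
```
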
1443
+
1444
+ static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
1445
+ const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
1446
+ scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
1447
+ }
1448
+
945
1449
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
946
1450
  GGML_ASSERT(nrows % 2 == 0);
947
1451
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -950,6 +1454,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
950
1454
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
951
1455
  }
952
1456
 
1457
+ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
1458
+ const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
1459
+ const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
1460
+ const dim3 block_nums(block_num_x, nrows_x, 1);
1461
+ diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
1462
+ }
1463
+
1464
+ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
1465
+ const dim3 block_dims(WARP_SIZE, 1, 1);
1466
+ const dim3 block_nums(1, nrows_x, 1);
1467
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
1468
+ }
1469
+
953
1470
  // buffer pool for cuda
954
1471
  #define MAX_CUDA_BUFFERS 256
955
1472
 
@@ -1018,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
1018
1535
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1019
1536
  static size_t g_scratch_offset = 0;
1020
1537
 
1021
- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1022
- #define GGML_CUDA_MAX_EVENTS 64
1023
-
1024
1538
  static int g_device_count = -1;
1025
1539
  static int g_main_device = 0;
1026
1540
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1027
1541
 
1028
1542
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1029
1543
 
1030
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1031
-
1032
- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1033
- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1544
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
1034
1545
 
1035
1546
  void ggml_init_cublas() {
1036
1547
  static bool initialized = false;
@@ -1054,15 +1565,8 @@ void ggml_init_cublas() {
1054
1565
  for (int id = 0; id < g_device_count; ++id) {
1055
1566
  CUDA_CHECK(cudaSetDevice(id));
1056
1567
 
1057
- // create streams
1058
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1059
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1060
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1061
- }
1062
- // create events
1063
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1064
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1065
- }
1568
+ // create main stream
1569
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
1066
1570
 
1067
1571
  // create cublas handle
1068
1572
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1105,6 +1609,9 @@ void * ggml_cuda_host_malloc(size_t size) {
1105
1609
  void * ptr = nullptr;
1106
1610
  cudaError_t err = cudaMallocHost((void **) &ptr, size);
1107
1611
  if (err != cudaSuccess) {
1612
+ // The allocation error can be bypassed; a null ptr will be returned from this function.
1613
+ // This works around the OOM error seen in WSL.
1614
+ cudaGetLastError();
1108
1615
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
1109
1616
  size/1024.0/1024.0, cudaGetErrorString(err));
1110
1617
  return nullptr;
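The added cudaGetLastError() call clears CUDA's sticky error state, so a failed pinned allocation (common under WSL) no longer trips the next CUDA_CHECK; the caller simply receives a null pointer. A minimal sketch of how a caller might degrade gracefully to pageable host memory (the helper below is hypothetical, not part of this diff):

static void * alloc_host_buffer(size_t size, bool * is_pinned) {
    void * ptr = ggml_cuda_host_malloc(size); // returns nullptr if pinning fails
    *is_pinned = ptr != nullptr;
    if (ptr == nullptr) {
        ptr = malloc(size); // pageable fallback: slower host-to-device copies, but no hard failure
    }
    return ptr;
}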
@@ -1117,10 +1624,25 @@ void ggml_cuda_host_free(void * ptr) {
1117
1624
  CUDA_CHECK(cudaFreeHost(ptr));
1118
1625
  }
1119
1626
 
1120
- static cudaError_t ggml_cuda_h2d_tensor_2d(
1627
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1121
1628
  void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1122
1629
 
1123
- char * dst_char = (char *) dst;
1630
+ cudaMemcpyKind kind;
1631
+ char * src_ptr;
1632
+ if (src->backend == GGML_BACKEND_CPU) {
1633
+ kind = cudaMemcpyHostToDevice;
1634
+ src_ptr = (char *) src->data;
1635
+ } else if (src->backend == GGML_BACKEND_GPU) {
1636
+ kind = cudaMemcpyDeviceToDevice;
1637
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
1638
+ int id;
1639
+ CUDA_CHECK(cudaGetDevice(&id));
1640
+ src_ptr = (char *) extra->data_device[id];
1641
+ } else {
1642
+ GGML_ASSERT(false);
1643
+ }
1644
+ char * dst_ptr = (char *) dst;
1645
+
1124
1646
  const int64_t ne0 = src->ne[0];
1125
1647
  const int64_t nb0 = src->nb[0];
1126
1648
  const int64_t nb1 = src->nb[1];
@@ -1131,17 +1653,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
1131
1653
  const int64_t bs = ggml_blck_size(type);
1132
1654
  int64_t i1_diff = i1_high - i1_low;
1133
1655
 
1134
- const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
1656
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1135
1657
  if (nb0 == ts && nb1 == ts*ne0/bs) {
1136
- return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
1658
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
1137
1659
  } else if (nb0 == ts) {
1138
- return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
1660
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
1139
1661
  } else {
1140
1662
  for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1141
1663
  const void * rx = (const void *) ((const char *) x + i1*nb1);
1142
- void * rd = (void *) (dst_char + i1*ts*ne0/bs);
1664
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1143
1665
  // pretend the row is a matrix with cols=1
1144
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
1666
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
1145
1667
  if (r != cudaSuccess) return r;
1146
1668
  }
1147
1669
  return cudaSuccess;
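The renamed helper now handles three layouts: a fully contiguous block (single cudaMemcpyAsync), rows that are contiguous internally but strided relative to each other (one cudaMemcpy2DAsync), and fully strided data (one 2D copy per row). The middle case works because cudaMemcpy2DAsync copies `height` rows of `width` bytes with independent source and destination pitches. A small self-contained sketch of that usage (the sizes are made up for illustration):

static void pack_rows_example(cudaStream_t stream) {
    // 4 rows of 8 floats on the host, of which only the first 6 per row are meaningful
    static float src[4][8];
    float * dst_d = nullptr;
    CUDA_CHECK(cudaMalloc((void **) &dst_d, 4*6*sizeof(float)));

    // read rows 8 floats apart, write them densely packed 6 floats apart
    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, 6*sizeof(float), src, 8*sizeof(float),
                                 6*sizeof(float), 4, cudaMemcpyHostToDevice, stream));
}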
@@ -1260,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1260
1782
  const int64_t ne00 = src0->ne[0];
1261
1783
  const int64_t nrows = i01_high - i01_low;
1262
1784
 
1785
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1786
+ #ifdef GGML_CUDA_DMMV_F16
1787
+ size_t ash;
1788
+ dfloat * src1_dfloat = nullptr; // dfloat == half
1789
+
1790
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1791
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1792
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1793
+
1794
+ if (src1_convert_f16) {
1795
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
1796
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
1797
+ ne00, 1, sizeof(float), 0, 0,
1798
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
1799
+ }
1800
+ #else
1801
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
1802
+ #endif // GGML_CUDA_DMMV_F16
1803
+
1263
1804
  switch (src0->type) {
1264
1805
  case GGML_TYPE_Q4_0:
1265
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1806
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1266
1807
  break;
1267
1808
  case GGML_TYPE_Q4_1:
1268
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1809
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1269
1810
  break;
1270
1811
  case GGML_TYPE_Q5_0:
1271
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1812
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1272
1813
  break;
1273
1814
  case GGML_TYPE_Q5_1:
1274
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1815
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1275
1816
  break;
1276
1817
  case GGML_TYPE_Q8_0:
1277
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1818
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1278
1819
  break;
1279
1820
  case GGML_TYPE_Q2_K:
1280
1821
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -1292,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1292
1833
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1293
1834
  break;
1294
1835
  case GGML_TYPE_F16:
1295
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1836
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1296
1837
  break;
1297
1838
  default:
1298
1839
  GGML_ASSERT(false);
@@ -1300,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1300
1841
  }
1301
1842
  CUDA_CHECK(cudaGetLastError());
1302
1843
 
1844
+ #ifdef GGML_CUDA_DMMV_F16
1845
+ if (src1_convert_f16) {
1846
+ ggml_cuda_pool_free(src1_dfloat, ash);
1847
+ }
1848
+ #endif // GGML_CUDA_DMMV_F16
1849
+
1303
1850
  (void) src1;
1304
1851
  (void) dst;
1305
1852
  (void) src0_ddf_i;
@@ -1377,8 +1924,81 @@ inline void ggml_cuda_op_rope(
1377
1924
  (void) i1;
1378
1925
  }
1379
1926
 
1927
+ inline void ggml_cuda_op_diag_mask_inf(
1928
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1929
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1930
+ cudaStream_t & cudaStream_main){
1931
+
1932
+ GGML_ASSERT(src0_ddf_i != nullptr);
1933
+ GGML_ASSERT(dst_ddf_i != nullptr);
1934
+
1935
+ const int64_t ne00 = src0->ne[0];
1936
+ const int64_t ne01 = src0->ne[1];
1937
+ const int64_t i01_diff = i01_high - i01_low;
1938
+
1939
+ const int n_past = ((int32_t *) src1->data)[0];
1940
+
1941
+ // compute
1942
+ diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
1943
+ CUDA_CHECK(cudaGetLastError());
1944
+
1945
+ (void) dst;
1946
+ (void) src0_ddq_i;
1947
+ (void) src1_ddf_i;
1948
+ (void) i02;
1949
+ (void) i1;
1950
+ }
1951
+
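ggml_cuda_op_diag_mask_inf reads n_past from src1 on the host and then masks out future positions entirely on the GPU. A minimal sketch of what such a masking kernel looks like (simplified; the real diag_mask_inf_f32 kernel is defined earlier in this file):

static __global__ void diag_mask_inf_f32_sketch(const float * x, float * dst, const int ncols,
                                                const int rows_per_channel, const int n_past) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;

    if (col >= ncols) {
        return;
    }

    const int i = row*ncols + col;
    // a position may only attend to itself and to the n_past tokens before it
    dst[i] = col > n_past + row % rows_per_channel ? -INFINITY : x[i];
}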
1952
+ inline void ggml_cuda_op_soft_max(
1953
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1954
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1955
+ cudaStream_t & cudaStream_main){
1956
+
1957
+ GGML_ASSERT(src0_ddf_i != nullptr);
1958
+ GGML_ASSERT(dst_ddf_i != nullptr);
1959
+
1960
+ const int64_t ne00 = src0->ne[0];
1961
+ const int64_t i01_diff = i01_high - i01_low;
1962
+
1963
+ // compute
1964
+ soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
1965
+ CUDA_CHECK(cudaGetLastError());
1966
+
1967
+ (void) src1;
1968
+ (void) dst;
1969
+ (void) src0_ddq_i;
1970
+ (void) src1_ddf_i;
1971
+ (void) i02;
1972
+ (void) i1;
1973
+ }
1974
+
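soft_max_f32_cuda assigns one WARP_SIZE-wide block per row, so the row sum has to be reduced across the warp. A compact sketch of that reduction pattern (simplified, without the usual max-subtraction for numerical stability; the kernel name is hypothetical):

static __global__ void soft_max_f32_sketch(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.y;
    const int tid = threadIdx.x;

    // each thread exponentiates a strided subset of the row and accumulates a partial sum
    float sum = 0.0f;
    for (int col = tid; col < ncols; col += WARP_SIZE) {
        const float val = expf(x[row*ncols + col]);
        dst[row*ncols + col] = val;
        sum += val;
    }

    // butterfly reduction: after log2(WARP_SIZE) steps every lane holds the full row sum
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        sum += __shfl_xor_sync(0xffffffff, sum, mask, WARP_SIZE);
    }

    for (int col = tid; col < ncols; col += WARP_SIZE) {
        dst[row*ncols + col] /= sum;
    }
}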
1975
+ inline void ggml_cuda_op_scale(
1976
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
1977
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1978
+ cudaStream_t & cudaStream_main){
1979
+
1980
+ GGML_ASSERT(src0_ddf_i != nullptr);
1981
+ GGML_ASSERT(dst_ddf_i != nullptr);
1982
+
1983
+ const float scale = ((float *) src1->data)[0];
1984
+
1985
+ const int64_t ne00 = src0->ne[0];
1986
+ const int64_t i01_diff = i01_high - i01_low;
1987
+
1988
+ // compute
1989
+ scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
1990
+ CUDA_CHECK(cudaGetLastError());
1991
+
1992
+ (void) src1;
1993
+ (void) dst;
1994
+ (void) src0_ddq_i;
1995
+ (void) src1_ddf_i;
1996
+ (void) i02;
1997
+ (void) i1;
1998
+ }
1999
+
1380
2000
  static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
1381
- ggml_cuda_op_t op, bool src0_needs_f32) {
2001
+ ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
1382
2002
  const int64_t ne00 = src0->ne[0];
1383
2003
  const int64_t ne01 = src0->ne[1];
1384
2004
  const int64_t ne02 = src0->ne[2];
@@ -1401,21 +2021,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1401
2021
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
1402
2022
 
1403
2023
  // strides for iteration over dims 3 and 2
1404
- const int64_t src0_stride = ne00 * ne01;
1405
- const int64_t src1_stride = ne10 * ne11;
1406
- const int64_t dst_stride = ne0 * ne1;
1407
- const int64_t num_iters = ne02 * ne03;
2024
+ const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
2025
+ const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
2026
+ const int64_t src0_stride = ne00 * ne01 * stride_mod;
2027
+ const int64_t src1_stride = ne10 * ne11 * stride_mod;
2028
+ const int64_t dst_stride = ne0 * ne1 * stride_mod;
1408
2029
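// worked example of the flatten_rows strides (illustrative numbers, not from this diff):
// with ne01 = 4, ne02 = 2, ne03 = 3,
//   flatten_rows == false: num_iters = 2*3 = 6, src0_stride = ne00*4       (one 2D slice per iteration)
//   flatten_rows == true : num_iters = 1,       src0_stride = ne00*4*2*3   (all 24 rows in a single pass)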
 
1409
2030
  const size_t src0_ts = ggml_type_size(src0->type);
1410
2031
  const size_t src0_bs = ggml_blck_size(src0->type);
1411
2032
 
1412
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2033
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
1413
2034
  struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
1414
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2035
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
1415
2036
 
1416
2037
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
2038
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1417
2039
  const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
1418
2040
 
2041
+ const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
2042
+ const bool src1_stays_on_host = use_src1 && (
2043
+ dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
2044
+
1419
2045
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
1420
2046
 
1421
2047
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -1424,13 +2050,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1424
2050
  char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
1425
2051
  float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
1426
2052
  float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1427
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
2053
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1428
2054
 
1429
2055
  // asq = actual size quantized, asf = actual size float
1430
2056
  size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
1431
2057
  size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
1432
2058
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1433
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2059
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2060
+
2061
+ // if multiple GPUs are used they need to wait for the main GPU to finish
2062
+ if (split && g_device_count > 1) {
2063
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2064
+ CUDA_CHECK(cudaDeviceSynchronize());
2065
+ }
1434
2066
 
1435
2067
  for (int id = 0; id < g_device_count; ++id) {
1436
2068
  if (!split && id != g_main_device) {
@@ -1443,9 +2075,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1443
2075
  int64_t row_low, row_high;
1444
2076
  if (split) {
1445
2077
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
1446
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1447
2078
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
1448
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1449
2079
  } else {
1450
2080
  row_low = 0;
1451
2081
  row_high = nrows0;
@@ -1458,7 +2088,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1458
2088
 
1459
2089
  cudaSetDevice(id);
1460
2090
 
1461
- if (src0_on_device) {
2091
+ if (src0_on_device && src0_is_contiguous) {
1462
2092
  if (src0_is_f32) {
1463
2093
  src0_ddf[id] = (float *) src0_extra->data_device[id];
1464
2094
  } else {
@@ -1476,8 +2106,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1476
2106
  src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
1477
2107
  }
1478
2108
 
1479
- if (use_src1) {
1480
- if (src1_on_device) {
2109
+ if (use_src1 && !src1_stays_on_host) {
2110
+ if (src1_on_device && src1_is_contiguous) {
1481
2111
  src1_ddf[id] = (float *) src1_extra->data_device[id];
1482
2112
  } else {
1483
2113
  src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
@@ -1490,26 +2120,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1490
2120
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
1491
2121
  }
1492
2122
 
1493
- for (int64_t i03 = 0; i03 < ne03; i03++) {
2123
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
2124
+ const int64_t i02_max = flatten_rows ? 1 : ne02;
2125
+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
2126
+
2127
+ for (int64_t i03 = 0; i03 < i03_max; i03++) {
1494
2128
  const int64_t i13 = i03 % ne13;
1495
- for (int64_t i02 = 0; i02 < ne02; i02++) {
2129
+ for (int64_t i02 = 0; i02 < i02_max; i02++) {
1496
2130
  const int64_t i12 = i02 % ne12;
1497
2131
 
1498
2132
  const int64_t i0 = i03*ne02 + i02;
1499
- const int64_t i0_offset_low = row_low/ne01;
1500
- const int64_t i0_offset_high = row_high/ne01;
2133
+
2134
+ // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
2135
+ const int64_t i0_offset_low = row_low/rows_per_iter;
2136
+ const int64_t i0_offset_high = row_high/rows_per_iter;
1501
2137
 
1502
2138
  int64_t i01_low = 0;
1503
- int64_t i01_high = ne01;
2139
+ int64_t i01_high = rows_per_iter;
1504
2140
  if (split) {
1505
2141
  if (i0 < i0_offset_low || i0 > i0_offset_high) {
1506
2142
  continue;
1507
2143
  }
1508
2144
  if (i0 == i0_offset_low) {
1509
- i01_low = row_low % ne01;
2145
+ i01_low = row_low % rows_per_iter;
1510
2146
  }
1511
2147
  if (i0 == i0_offset_high) {
1512
- i01_high = row_high % ne01;
2148
+ i01_high = row_high % rows_per_iter;
1513
2149
  }
1514
2150
  }
1515
2151
 
@@ -1518,7 +2154,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1518
2154
  // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
1519
2155
  // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
1520
2156
  GGML_ASSERT(i01_low == 0 || g_device_count > 1);
1521
- GGML_ASSERT(i01_high == ne01 || g_device_count > 1);
2157
+ GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
1522
2158
 
1523
2159
  const int64_t i01_diff = i01_high - i01_low;
1524
2160
  if (i01_diff == 0) {
@@ -1526,24 +2162,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1526
2162
  }
1527
2163
  const int64_t i11 = i13*ne12 + i12;
1528
2164
 
1529
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1530
- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1531
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2165
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
1532
2166
 
1533
2167
  // for split tensors the data begins at i0 == i0_offset_low
1534
2168
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
1535
2169
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
1536
2170
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
1537
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
2171
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
1538
2172
 
1539
2173
  // for split tensors the data pointer needs to be rounded down
1540
2174
  // to the bin edge for i03, i02 bins beyond the first
1541
2175
  if (i0 - i0_offset_low > 0) {
2176
+ GGML_ASSERT(!flatten_rows);
1542
2177
  src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
1543
2178
  src0_ddf_i -= (row_low % ne01)*ne00;
1544
- }
1545
- if (i0 - i0_offset_low > 0) {
1546
- dst_ddf_i -= (row_low % ne0)*ne1;
2179
+ dst_ddf_i -= (row_low % ne0)*ne1;
1547
2180
  }
1548
2181
 
1549
2182
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
@@ -1553,38 +2186,41 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1553
2186
  }
1554
2187
 
1555
2188
  // copy src0, src1 to device if necessary
1556
- if (use_src1) {
2189
+ if (use_src1 && !src1_stays_on_host) {
1557
2190
  if (src1->backend == GGML_BACKEND_CPU) {
1558
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
1559
- } else if (src1->backend == GGML_BACKEND_GPU) {
2191
+ GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
2192
+ int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2193
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
2194
+ } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
1560
2195
  if (id != g_main_device) {
2196
+ GGML_ASSERT(!flatten_rows);
1561
2197
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1562
2198
  src1_ddf_i_source += i11*src1_stride;
1563
2199
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1564
- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2200
+ cudaMemcpyDeviceToDevice, cudaStream_main));
1565
2201
  }
2202
+ } else if (src1_on_device && !src1_is_contiguous) {
2203
+ GGML_ASSERT(!split);
2204
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
1566
2205
  } else {
1567
2206
  GGML_ASSERT(false);
1568
2207
  }
1569
2208
  }
1570
- CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1571
- if (!src0_on_device) {
2209
+
2210
+ if (!src0_on_device || !src0_is_contiguous) {
1572
2211
  if (src0_is_f32) {
1573
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2212
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1574
2213
  } else {
1575
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1576
2215
  }
1577
2216
  }
1578
2217
 
1579
- // convert src0 to f32 if it's necessary for the ggml_cuda_op
2218
+ // convert src0 to f32 if it is necessary for the ggml_cuda_op
1580
2219
  if (src0_needs_f32 && !src0_is_f32) {
1581
2220
  to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
1582
2221
  CUDA_CHECK(cudaGetLastError());
1583
2222
  }
1584
2223
 
1585
- // wait with main stream until src1 memcpy is done
1586
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
1587
-
1588
2224
  // do the computation
1589
2225
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
1590
2226
 
@@ -1622,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1622
2258
 
1623
2259
  // wait until each device is finished, then free their buffers
1624
2260
  for (int id = 0; id < g_device_count; ++id) {
2261
+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
2262
+ continue;
2263
+ }
2264
+
1625
2265
  CUDA_CHECK(cudaSetDevice(id));
1626
2266
  CUDA_CHECK(cudaDeviceSynchronize());
2267
+
1627
2268
  if (src0_asq[id] > 0) {
1628
2269
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
1629
2270
  }
@@ -1641,39 +2282,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1641
2282
 
1642
2283
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1643
2284
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1644
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
2285
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
1645
2286
  }
1646
2287
 
1647
2288
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1648
2289
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1649
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
2290
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
1650
2291
  }
1651
2292
 
1652
2293
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1653
2294
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1654
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
2295
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
1655
2296
  }
1656
2297
 
1657
2298
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1658
2299
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1659
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
2300
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
1660
2301
  }
1661
2302
 
1662
2303
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1663
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
1664
2304
  const int64_t ne10 = src1->ne[0];
1665
2305
 
1666
2306
  const int64_t ne0 = dst->ne[0];
1667
2307
  const int64_t ne1 = dst->ne[1];
1668
2308
 
1669
- // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
1670
- // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
1671
- // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
1672
- // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
1673
- // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
1674
- // return false;
1675
- // }
1676
-
1677
2309
  // TODO: find the optimal values for these
1678
2310
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1679
2311
  src1->type == GGML_TYPE_F32 &&
@@ -1685,23 +2317,152 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
1685
2317
  return false;
1686
2318
  }
1687
2319
 
2320
+ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2321
+ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
2322
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2323
+ GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
2324
+ GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
2325
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2326
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2327
+
2328
+ const int64_t ne00 = src0->ne[0];
2329
+ const int64_t ne01 = src0->ne[1];
2330
+ const int64_t ne02 = src0->ne[2];
2331
+
2332
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2333
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2334
+
2335
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2336
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2337
+
2338
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2339
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2340
+
2341
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2342
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2343
+
2344
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2345
+ }
2346
+
2347
+ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
2348
+ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
2349
+ GGML_ASSERT(!ggml_is_permuted(src0));
2350
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
2351
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
2352
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
2353
+
2354
+ const int64_t ne00 = src0->ne[0];
2355
+ const int64_t ne01 = src0->ne[1];
2356
+ const int64_t ne02 = src0->ne[2];
2357
+
2358
+ const int64_t nb01 = src0->nb[1];
2359
+ const int64_t nb02 = src0->nb[2];
2360
+
2361
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2362
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2363
+
2364
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2365
+ void * src0_ddq = src0_extra->data_device[g_main_device];
2366
+
2367
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2368
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
2369
+
2370
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
2371
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2372
+
2373
+ const int row_stride_x = nb01 / sizeof(half);
2374
+ const int channel_stride_x = nb02 / sizeof(half);
2375
+
2376
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2377
+ }
2378
+
1688
2379
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1689
- if (src0->type == GGML_TYPE_F32) {
1690
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
2380
+ bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
2381
+ src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
2382
+
2383
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
2384
+ ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
2385
+ } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
2386
+ ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
2387
+ } else if (src0->type == GGML_TYPE_F32) {
2388
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
1691
2389
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
1692
- if (src1->ne[1] == 1) {
1693
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
2390
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2391
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
1694
2392
  } else {
1695
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
2393
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
1696
2394
  }
1697
2395
  } else {
1698
2396
  GGML_ASSERT(false);
1699
2397
  }
1700
2398
  }
1701
2399
 
2400
+ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2401
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2402
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
2403
+ }
2404
+
2405
+ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2406
+ const int64_t ne = ggml_nelements(src0);
2407
+ GGML_ASSERT(ne == ggml_nelements(src1));
2408
+
2409
+ GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
2410
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
2411
+
2412
+ GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
2413
+ GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
2414
+
2415
+ const int64_t ne00 = src0->ne[0];
2416
+ const int64_t ne01 = src0->ne[1];
2417
+ GGML_ASSERT(src0->ne[3] == 1);
2418
+
2419
+ const int64_t nb00 = src0->nb[0];
2420
+ const int64_t nb01 = src0->nb[1];
2421
+ const int64_t nb02 = src0->nb[2];
2422
+
2423
+ const int64_t ne10 = src1->ne[0];
2424
+ const int64_t ne11 = src1->ne[1];
2425
+ GGML_ASSERT(src1->ne[3] == 1);
2426
+
2427
+ const int64_t nb10 = src1->nb[0];
2428
+ const int64_t nb11 = src1->nb[1];
2429
+ const int64_t nb12 = src1->nb[2];
2430
+
2431
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2432
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2433
+
2434
+ const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2435
+ const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
2436
+
2437
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2438
+ char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
2439
+
2440
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
2441
+ ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2442
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2443
+ } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
2444
+ ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
2445
+ ne10, ne11, nb10, nb11, nb12, cudaStream_main);
2446
+ } else {
2447
+ GGML_ASSERT(false);
2448
+ }
2449
+
2450
+ (void) dst;
2451
+ }
2452
+
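ggml_cuda_cpy drives the templated cpy_f32_f16<...> kernel added earlier in this diff: the template parameter is the per-element copy, while the surrounding kernel only does the strided index math between the (ne00, ne01, nb0x) source layout and the (ne10, ne11, nb1x) destination layout. As a simplified illustration of the f32 -> f16 element step (a sketch, not necessarily the exact shipped kernel):

static __device__ void cpy_1_f32_f16_sketch(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    half * dsti = (half *) cdsti;

    *dsti = __float2half(*xi); // single-element convert-and-store
}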
2453
+ void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2454
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2455
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
2456
+ }
2457
+
2458
+ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2459
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2460
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
2461
+ }
2462
+
1702
2463
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1703
2464
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1704
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
2465
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
1705
2466
  }
1706
2467
 
1707
2468
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1710,16 +2471,14 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
1710
2471
  (void) dst;
1711
2472
  }
1712
2473
 
1713
- void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
1714
- FILE * fp = fopen(fname, "rb");
2474
+ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
1715
2475
  int nrows = ggml_nrows(tensor);
1716
2476
  const size_t nb1 = tensor->nb[1];
1717
2477
  ggml_backend backend = tensor->backend;
1718
2478
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
2479
+ memset(extra, 0, sizeof(*extra));
1719
2480
 
1720
2481
  for (int id = 0; id < g_device_count; ++id) {
1721
- extra->data_device[id] = nullptr;
1722
-
1723
2482
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
1724
2483
  continue;
1725
2484
  }
@@ -1732,10 +2491,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
1732
2491
  row_high = nrows;
1733
2492
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
1734
2493
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
1735
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1736
2494
  row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
1737
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1738
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1739
2495
  } else {
1740
2496
  GGML_ASSERT(false);
1741
2497
  }
@@ -1745,35 +2501,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
1745
2501
 
1746
2502
  int64_t nrows_split = row_high - row_low;
1747
2503
 
1748
- const size_t offset_split = offset + row_low*nb1;
2504
+ const size_t offset_split = row_low*nb1;
1749
2505
  const size_t size = ggml_nbytes_split(tensor, nrows_split);
1750
2506
 
1751
2507
  void * buf;
1752
2508
  CUDA_CHECK(cudaMalloc(&buf, size));
1753
- void * buf_host = malloc(size);
1754
-
1755
- #ifdef _WIN32
1756
- int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
1757
- #else
1758
- int ret = fseek(fp, (long) offset_split, SEEK_SET);
1759
- #endif
1760
- GGML_ASSERT(ret == 0); // same
1761
-
1762
- size_t ret2 = fread(buf_host, size, 1, fp);
1763
- if (ret2 != 1) {
1764
- fprintf(stderr, "unexpectedly reached end of file");
1765
- exit(1);
1766
- }
2509
+ void * buf_host = (char*)data + offset_split;
1767
2510
 
1768
2511
  cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
1769
- cudaDeviceSynchronize();
1770
2512
 
1771
- free(buf_host);
1772
2513
  extra->data_device[id] = buf;
1773
2514
  }
1774
2515
 
1775
2516
  tensor->extra = extra;
1776
- fclose(fp);
1777
2517
  }
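ggml_cuda_load_data, which re-read the weights from disk, is replaced by ggml_cuda_transform_tensor, which takes the tensor's bytes from a host buffer the loader already holds (for example an mmap'd model file). A hypothetical usage sketch (the helper name and offset parameter are illustrative, not part of this API):

static void upload_weight(struct ggml_tensor * t, void * model_data, size_t file_offset) {
    t->backend = GGML_BACKEND_GPU; // or GGML_BACKEND_GPU_SPLIT to shard rows across devices
    ggml_cuda_transform_tensor((char *) model_data + file_offset, t);
}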
1778
2518
 
1779
2519
  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -1795,47 +2535,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
1795
2535
  delete extra;
1796
2536
  }
1797
2537
 
1798
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
1799
- if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
1800
- ggml_cuda_assign_buffers(tensor);
2538
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2539
+ if (scratch && g_scratch_size == 0) {
2540
+ return;
1801
2541
  }
1802
2542
 
1803
- const size_t size = ggml_nbytes(tensor);
1804
- GGML_ASSERT(size <= g_scratch_size);
1805
- if (g_scratch_offset + size > g_scratch_size) {
1806
- g_scratch_offset = 0;
2543
+ // recursively assign CUDA buffers until a compute tensor is found
2544
+ if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2545
+ const ggml_op src0_op = tensor->src0->op;
2546
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2547
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
2548
+ }
2549
+ }
2550
+ if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2551
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
1807
2552
  }
1808
2553
 
1809
2554
  tensor->backend = GGML_BACKEND_GPU;
1810
2555
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
1811
2556
 
1812
- bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
2557
+ const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2558
+ tensor->op == GGML_OP_VIEW;
2559
+ const size_t size = ggml_nbytes(tensor);
1813
2560
 
1814
2561
  CUDA_CHECK(cudaSetDevice(g_main_device));
1815
2562
  if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
1816
2563
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
1817
- extra->data_device[g_main_device] = src0_extra->data_device;
1818
- GGML_ASSERT(false);
1819
- } else {
2564
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2565
+ size_t offset = 0;
2566
+ if (tensor->op == GGML_OP_VIEW) {
2567
+ memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
2568
+ }
2569
+ extra->data_device[g_main_device] = src0_ddc + offset;
2570
+ } else if (tensor->op == GGML_OP_CPY) {
2571
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
2572
+ void * src1_ddv = src1_extra->data_device[g_main_device];
2573
+ extra->data_device[g_main_device] = src1_ddv;
2574
+ } else if (scratch) {
2575
+ GGML_ASSERT(size <= g_scratch_size);
2576
+ if (g_scratch_offset + size > g_scratch_size) {
2577
+ g_scratch_offset = 0;
2578
+ }
2579
+
1820
2580
  char * data = (char *) g_scratch_buffer;
1821
2581
  if (data == nullptr) {
1822
2582
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
1823
2583
  g_scratch_buffer = data;
1824
2584
  }
1825
2585
  extra->data_device[g_main_device] = data + g_scratch_offset;
1826
- }
1827
2586
 
1828
- // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]);
1829
- g_scratch_offset += size;
1830
- // fprintf(stderr, "%s: scratch %d, %p - %p\n",
1831
- // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
2587
+ g_scratch_offset += size;
2588
+
2589
+ GGML_ASSERT(g_scratch_offset <= g_scratch_size);
2590
+ } else { // allocate new buffers outside of scratch
2591
+ void * data;
2592
+ CUDA_CHECK(cudaMalloc(&data, size));
2593
+ CUDA_CHECK(cudaMemset(data, 0, size));
2594
+ extra->data_device[g_main_device] = data;
2595
+ }
1832
2596
 
1833
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
1834
2597
  tensor->extra = extra;
1835
2598
  }
1836
2599
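The scratch path above behaves like a single lazily allocated ring buffer: offsets are handed out sequentially and wrap back to zero when the next tensor would not fit, which is acceptable because intermediate results are only needed for a short window of the graph. A stripped-down sketch of that allocation policy (names are hypothetical):

static char * scratch_data = nullptr;
static size_t scratch_capacity = 1024*1024*1024;
static size_t scratch_offset = 0;

static void * scratch_alloc(size_t size) {
    GGML_ASSERT(size <= scratch_capacity);
    if (scratch_data == nullptr) {
        CUDA_CHECK(cudaMalloc((void **) &scratch_data, scratch_capacity)); // allocated on first use
    }
    if (scratch_offset + size > scratch_capacity) {
        scratch_offset = 0; // wrap around; older intermediates may be overwritten
    }
    void * ptr = scratch_data + scratch_offset;
    scratch_offset += size;
    return ptr;
}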
 
2600
+ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2601
+ ggml_cuda_assign_buffers_impl(tensor, true);
2602
+ }
2603
+
2604
+ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2605
+ ggml_cuda_assign_buffers_impl(tensor, false);
2606
+ }
2607
+
1837
2608
  void ggml_cuda_set_main_device(int main_device) {
1838
- if (main_device > g_device_count) {
2609
+ if (main_device >= g_device_count) {
1839
2610
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
1840
2611
  main_device, g_device_count, g_main_device);
1841
2612
  return;
@@ -1852,6 +2623,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
1852
2623
  g_scratch_size = scratch_size;
1853
2624
  }
1854
2625
 
2626
+ void ggml_cuda_free_scratch() {
2627
+ if (g_scratch_buffer == nullptr) {
2628
+ return;
2629
+ }
2630
+
2631
+ CUDA_CHECK(cudaFree(g_scratch_buffer));
2632
+ g_scratch_buffer = nullptr;
2633
+ }
2634
+
1855
2635
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
1856
2636
  ggml_cuda_func_t func;
1857
2637
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -1889,12 +2669,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
1889
2669
  }
1890
2670
  func = ggml_cuda_mul_mat;
1891
2671
  break;
2672
+ case GGML_OP_SCALE:
2673
+ if (!any_on_device) {
2674
+ return false;
2675
+ }
2676
+ func = ggml_cuda_scale;
2677
+ break;
2678
+ case GGML_OP_CPY:
2679
+ if (!any_on_device) {
2680
+ return false;
2681
+ }
2682
+ func = ggml_cuda_cpy;
2683
+ break;
1892
2684
  case GGML_OP_RESHAPE:
2685
+ case GGML_OP_VIEW:
2686
+ case GGML_OP_PERMUTE:
2687
+ case GGML_OP_TRANSPOSE:
1893
2688
  if (!any_on_device) {
1894
2689
  return false;
1895
2690
  }
1896
2691
  func = ggml_cuda_nop;
1897
2692
  break;
2693
+ case GGML_OP_DIAG_MASK_INF:
2694
+ if (!any_on_device) {
2695
+ return false;
2696
+ }
2697
+ func = ggml_cuda_diag_mask_inf;
2698
+ break;
2699
+ case GGML_OP_SOFT_MAX:
2700
+ if (!any_on_device) {
2701
+ return false;
2702
+ }
2703
+ func = ggml_cuda_soft_max;
2704
+ break;
1898
2705
  case GGML_OP_ROPE:
1899
2706
  if (!any_on_device) {
1900
2707
  return false;