llama_cpp 0.8.0 → 0.9.1

@@ -29,6 +29,8 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasCreate hipblasCreate
  #define cublasGemmEx hipblasGemmEx
+ #define cublasGemmBatchedEx hipblasGemmBatchedEx
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
  #define cublasHandle_t hipblasHandle_t
  #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
  #define cublasSetStream hipblasSetStream
@@ -85,6 +87,24 @@
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+ // - 7B quantum model: +100-200 MB
+ // - 13B quantum model: +200-400 MB
+ //
+ //#define GGML_CUDA_FORCE_MMQ
+
+ // TODO: improve this to be correct for more hardware
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+ // probably other such cases, and not sure what happens on AMD hardware
+ #if !defined(GGML_CUDA_FORCE_MMQ)
+ #define CUDA_USE_TENSOR_CORES
+ #endif
+
+ // max batch size to use MMQ kernels when tensor cores are available
+ #define MMQ_MAX_BATCH_SIZE 32
+
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

@@ -468,7 +488,6 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 0; // disabled by default
@@ -494,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
  dst[i] = __hadd(x[i], __float2half(y[i]));
  }

+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = __half2float(x[i]) + y[i];
+ }
+
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -3552,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
  #define MMQ_X_Q4_0_RDNA1 64
  #define MMQ_Y_Q4_0_RDNA1 64
  #define NWARPS_Q4_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_0_AMPERE 4
+ #define MMQ_Y_Q4_0_AMPERE 32
+ #define NWARPS_Q4_0_AMPERE 4
+ #else
  #define MMQ_X_Q4_0_AMPERE 64
  #define MMQ_Y_Q4_0_AMPERE 128
  #define NWARPS_Q4_0_AMPERE 4
+ #endif
  #define MMQ_X_Q4_0_PASCAL 64
  #define MMQ_Y_Q4_0_PASCAL 64
  #define NWARPS_Q4_0_PASCAL 8
@@ -3613,9 +3647,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_1_RDNA1 64
  #define MMQ_Y_Q4_1_RDNA1 64
  #define NWARPS_Q4_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_1_AMPERE 4
+ #define MMQ_Y_Q4_1_AMPERE 32
+ #define NWARPS_Q4_1_AMPERE 4
+ #else
  #define MMQ_X_Q4_1_AMPERE 64
  #define MMQ_Y_Q4_1_AMPERE 128
  #define NWARPS_Q4_1_AMPERE 4
+ #endif
  #define MMQ_X_Q4_1_PASCAL 64
  #define MMQ_Y_Q4_1_PASCAL 64
  #define NWARPS_Q4_1_PASCAL 8
@@ -3676,9 +3716,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_0_RDNA1 64
  #define MMQ_Y_Q5_0_RDNA1 64
  #define NWARPS_Q5_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_0_AMPERE 4
+ #define MMQ_Y_Q5_0_AMPERE 32
+ #define NWARPS_Q5_0_AMPERE 4
+ #else
  #define MMQ_X_Q5_0_AMPERE 128
  #define MMQ_Y_Q5_0_AMPERE 64
  #define NWARPS_Q5_0_AMPERE 4
+ #endif
  #define MMQ_X_Q5_0_PASCAL 64
  #define MMQ_Y_Q5_0_PASCAL 64
  #define NWARPS_Q5_0_PASCAL 8
@@ -3737,9 +3783,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_1_RDNA1 64
  #define MMQ_Y_Q5_1_RDNA1 64
  #define NWARPS_Q5_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_1_AMPERE 4
+ #define MMQ_Y_Q5_1_AMPERE 32
+ #define NWARPS_Q5_1_AMPERE 4
+ #else
  #define MMQ_X_Q5_1_AMPERE 128
  #define MMQ_Y_Q5_1_AMPERE 64
  #define NWARPS_Q5_1_AMPERE 4
+ #endif
  #define MMQ_X_Q5_1_PASCAL 64
  #define MMQ_Y_Q5_1_PASCAL 64
  #define NWARPS_Q5_1_PASCAL 8
@@ -3798,9 +3850,15 @@ mul_mat_q5_1(
  #define MMQ_X_Q8_0_RDNA1 64
  #define MMQ_Y_Q8_0_RDNA1 64
  #define NWARPS_Q8_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q8_0_AMPERE 4
+ #define MMQ_Y_Q8_0_AMPERE 32
+ #define NWARPS_Q8_0_AMPERE 4
+ #else
  #define MMQ_X_Q8_0_AMPERE 128
  #define MMQ_Y_Q8_0_AMPERE 64
  #define NWARPS_Q8_0_AMPERE 4
+ #endif
  #define MMQ_X_Q8_0_PASCAL 64
  #define MMQ_Y_Q8_0_PASCAL 64
  #define NWARPS_Q8_0_PASCAL 8
@@ -3859,9 +3917,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q2_K_RDNA1 128
  #define MMQ_Y_Q2_K_RDNA1 32
  #define NWARPS_Q2_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q2_K_AMPERE 4
+ #define MMQ_Y_Q2_K_AMPERE 32
+ #define NWARPS_Q2_K_AMPERE 4
+ #else
  #define MMQ_X_Q2_K_AMPERE 64
  #define MMQ_Y_Q2_K_AMPERE 128
  #define NWARPS_Q2_K_AMPERE 4
+ #endif
  #define MMQ_X_Q2_K_PASCAL 64
  #define MMQ_Y_Q2_K_PASCAL 64
  #define NWARPS_Q2_K_PASCAL 8
@@ -3920,9 +3984,15 @@ mul_mat_q2_K(
  #define MMQ_X_Q3_K_RDNA1 32
  #define MMQ_Y_Q3_K_RDNA1 128
  #define NWARPS_Q3_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q3_K_AMPERE 4
+ #define MMQ_Y_Q3_K_AMPERE 32
+ #define NWARPS_Q3_K_AMPERE 4
+ #else
  #define MMQ_X_Q3_K_AMPERE 128
  #define MMQ_Y_Q3_K_AMPERE 128
  #define NWARPS_Q3_K_AMPERE 4
+ #endif
  #define MMQ_X_Q3_K_PASCAL 64
  #define MMQ_Y_Q3_K_PASCAL 64
  #define NWARPS_Q3_K_PASCAL 8
@@ -3983,9 +4053,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_K_RDNA1 32
  #define MMQ_Y_Q4_K_RDNA1 64
  #define NWARPS_Q4_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_K_AMPERE 4
+ #define MMQ_Y_Q4_K_AMPERE 32
+ #define NWARPS_Q4_K_AMPERE 4
+ #else
  #define MMQ_X_Q4_K_AMPERE 64
  #define MMQ_Y_Q4_K_AMPERE 128
  #define NWARPS_Q4_K_AMPERE 4
+ #endif
  #define MMQ_X_Q4_K_PASCAL 64
  #define MMQ_Y_Q4_K_PASCAL 64
  #define NWARPS_Q4_K_PASCAL 8
@@ -4046,9 +4122,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_K_RDNA1 32
  #define MMQ_Y_Q5_K_RDNA1 64
  #define NWARPS_Q5_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_K_AMPERE 4
+ #define MMQ_Y_Q5_K_AMPERE 32
+ #define NWARPS_Q5_K_AMPERE 4
+ #else
  #define MMQ_X_Q5_K_AMPERE 64
  #define MMQ_Y_Q5_K_AMPERE 128
  #define NWARPS_Q5_K_AMPERE 4
+ #endif
  #define MMQ_X_Q5_K_PASCAL 64
  #define MMQ_Y_Q5_K_PASCAL 64
  #define NWARPS_Q5_K_PASCAL 8
@@ -4107,9 +4189,15 @@ mul_mat_q5_K(
  #define MMQ_X_Q6_K_RDNA1 32
  #define MMQ_Y_Q6_K_RDNA1 64
  #define NWARPS_Q6_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q6_K_AMPERE 4
+ #define MMQ_Y_Q6_K_AMPERE 32
+ #define NWARPS_Q6_K_AMPERE 4
+ #else
  #define MMQ_X_Q6_K_AMPERE 64
  #define MMQ_Y_Q6_K_AMPERE 64
  #define NWARPS_Q6_K_AMPERE 4
+ #endif
  #define MMQ_X_Q6_K_PASCAL 64
  #define MMQ_Y_Q6_K_PASCAL 64
  #define NWARPS_Q6_K_PASCAL 8
@@ -4326,13 +4414,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous

  const half * x = (const half *) vx;

- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
  const int channel_x = channel / channel_x_divisor;

- const int nrows_y = ncols_x;
+ const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
- const int row_dst = row_x;
+ const int row_dst = row_x;

  const int idst = channel*nrows_dst + row_dst;

@@ -4345,13 +4433,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  break;
  }

- const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
- const float xi = __half2float(x[ix]);
-
  const int row_y = col_x;

+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
  const int iy = channel*nrows_y + row_y;

+ const float xi = __half2float(x[ix]);
+
  tmp += xi * y[iy];
  }

@@ -4405,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }

- // rope == RoPE == rotary positional embedding
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
+ return 1.0f - min(1.0f, max(0.0f, y));
+ }
+
+ struct rope_corr_dims {
+ float v[4];
+ };
+
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+ static __device__ void rope_yarn(
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+ float * cos_theta, float * sin_theta
+ ) {
+ // Get n-d rotational scaling corrected for extrapolation
+ float theta_interp = freq_scale * theta_extrap;
+ float theta = theta_interp;
+ if (ext_factor != 0.0f) {
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+ // Get n-d magnitude scaling corrected for interpolation
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+ }
+ *cos_theta = cosf(theta) * mscale;
+ *sin_theta = sinf(theta) * mscale;
+ }

+ // rope == RoPE == rotary positional embedding
  template<typename T, bool has_pos>
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
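For readers following the YaRN math, here is a host-side mirror of the rope_yarn_ramp/rope_yarn device functions added above, with the intended meaning of the inputs spelled out. This is an illustrative restatement in plain C++, not code from the diff:

    #include <algorithm>
    #include <cmath>

    // ramp is 1 inside the interpolated dimensions and falls to 0 past the extrapolation boundary
    static float yarn_ramp(float low, float high, int i0) {
        const float y = (i0 / 2 - low) / std::max(0.001f, high - low);
        return 1.0f - std::min(1.0f, std::max(0.0f, y));
    }

    // theta_extrap: angle from the original freq_base; freq_scale: linear position-interpolation factor;
    // ext_factor blends the two angles per dimension; mscale (attn_factor) rescales the magnitude
    static void yarn_cos_sin(float theta_extrap, float freq_scale, float low, float high,
                             int i0, float ext_factor, float mscale, float & c, float & s) {
        const float theta_interp = freq_scale * theta_extrap;
        float theta = theta_interp;
        if (ext_factor != 0.0f) {
            const float ramp_mix = yarn_ramp(low, high, i0) * ext_factor;
            theta   = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
            mscale *= 1.0f + 0.1f * std::log(1.0f / freq_scale);
        }
        c = std::cos(theta) * mscale;
        s = std::sin(theta) * mscale;
    }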
@@ -4421,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  const int i2 = row/p_delta_rows;

  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
  const float x1 = x[i + 1];
@@ -4434,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  }

  template<typename T, bool has_pos>
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope_neox(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4446,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  const int i = row*ncols + col/2;
  const int i2 = row/p_delta_rows;

+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+ const float cur_rot = -float(col)/ncols;
+
  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, cur_rot);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
  const float x1 = x[i + ncols/2];
@@ -4459,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
+ static __global__ void rope_glm_f32(
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ int n_ctx
+ ) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4472,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;
  const int i2 = row/p_delta_rows;

- const float col_theta_scale = powf(theta_scale, col);
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
  // FIXME: this is likely wrong
  const int p = pos != nullptr ? pos[i2] : 0;

@@ -4614,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
  }

+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ }
+
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
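The new launch helper uses the same ceiling-division grid sizing as the other element-wise wrappers; written out as a plain function for illustration (not part of the diff):

    // number of thread blocks needed so that num_blocks * block_size >= k
    static int num_blocks_for(int k, int block_size) {
        return (k + block_size - 1) / block_size;
    }
    // e.g. num_blocks_for(1000, 256) == 4, covering indices 0..1023; the kernel's
    // "if (i >= k) return;" guard discards the excess threads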
@@ -5491,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
  }

  template<typename T>
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }

  template<typename T>
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_neox_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ static void rope_glm_f32_cuda(
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, int n_ctx, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 4 == 0);
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5661,11 +5805,21 @@ void ggml_init_cublas() {
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
+ #if defined(GGML_CUDA_FORCE_MMQ)
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ #endif
+ #if defined(CUDA_USE_TENSOR_CORES)
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ #endif
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
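ggml_init_cublas now reports both new build flags at startup. A minimal stand-alone sketch of what the pair of macros from the earlier hunk resolves to at compile time, assuming a plain nvcc or host-compiler build (the printout is illustrative only, not library output):

    // sketch.cu -- e.g. build with `nvcc -DGGML_CUDA_FORCE_MMQ sketch.cu` to flip the branch
    #include <cstdio>

    #if !defined(GGML_CUDA_FORCE_MMQ)
    #define CUDA_USE_TENSOR_CORES       // same rule as in ggml-cuda.cu
    #endif

    #define MMQ_MAX_BATCH_SIZE 32       // MMQ kernels are kept only up to this batch size

    int main() {
    #ifdef CUDA_USE_TENSOR_CORES
        std::printf("tensor-core/cuBLAS path enabled; MMQ only for batch <= %d\n", MMQ_MAX_BATCH_SIZE);
    #else
        std::printf("GGML_CUDA_FORCE_MMQ set: quantized mat-muls always use the MMQ kernels\n");
    #endif
        return 0;
    }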
@@ -5675,15 +5829,15 @@ void ggml_init_cublas() {
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
  }

- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  CUDA_CHECK(ggml_cuda_set_device(id));

  // create cuda streams
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ for (int is = 0; is < MAX_STREAMS; ++is) {
  CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
  }

@@ -5907,7 +6061,10 @@ inline void ggml_cuda_op_add(
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
  } else {
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
  GGML_ASSERT(false);
  }

@@ -6252,16 +6409,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {

- GGML_ASSERT(src0_dd_i != nullptr);
+ GGML_ASSERT(src0_dd_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_dd_i != nullptr);
-
+ GGML_ASSERT(dst_dd_i != nullptr);

  const int64_t ne00 = src0->ne[0];
-
  const int64_t ne10 = src1->ne[0];

  const int64_t ne0 = dst->ne[0];
+
  const int64_t row_diff = row_high - row_low;

  int id;
@@ -6346,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  row_diff, src1_ncols, ne10,
  &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
+ src1_ddf_i, ne10,
  &beta, dst_dd_i, ldc));

  if (src0_as != 0) {
@@ -6372,17 +6528,20 @@ inline void ggml_cuda_op_rope(
  const int64_t ne2 = dst->ne[2];
  const int64_t nrows = ggml_nrows(src0);

- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- // RoPE alteration for extended context
-
- float freq_base, freq_scale;
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ //const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

- const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ // RoPE alteration for extended context
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

  const int32_t * pos = nullptr;
  if ((mode & 1) == 0) {
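The memcpy offsets above imply the following op_params layout for the RoPE op in this version. The real code keeps a raw int32 array and stores the float slots bit-for-bit; the struct below is only a hypothetical view to make the layout visible:

    #include <cstdint>

    struct rope_op_params_view {        // hypothetical mirror of dst->op_params slots
        int32_t n_past;                 // [0] (left commented out above)
        int32_t n_dims;                 // [1]
        int32_t mode;                   // [2]
        int32_t n_ctx;                  // [3]
        int32_t n_orig_ctx;             // [4] new in this version
        float   freq_base;              // [5]
        float   freq_scale;             // [6]
        float   ext_factor;             // [7]
        float   attn_factor;            // [8]
        float   beta_fast;              // [9]
        float   beta_slow;              // [10]
    };
    static_assert(sizeof(rope_op_params_view) == 11 * sizeof(int32_t), "one 4-byte slot each");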
@@ -6394,24 +6553,39 @@ inline void ggml_cuda_op_rope(
  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;

+ rope_corr_dims corr_dims;
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
  // compute
  if (is_glm) {
  GGML_ASSERT(false);
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
  } else {
  if (src0->type == GGML_TYPE_F32) {
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
@@ -6522,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const float min = ((float *) dst->op_params)[0];
- const float max = ((float *) dst->op_params)[1];
+ float min;
+ float max;
+ memcpy(&min, dst->op_params, sizeof(float));
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
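The clamp bounds are now copied out with memcpy instead of being read through a casted float pointer. A tiny self-contained illustration of why that matters: type-punning int32 storage through a float lvalue is undefined behaviour, while memcpy round-trips the bits safely (this snippet is an assumption-free toy, not code from the diff):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
        int32_t op_params[2] = {0, 0};            // the op stores its float options in int32 slots
        const float min_in = -1.5f, max_in = 2.5f;
        std::memcpy(&op_params[0], &min_in, sizeof(float));
        std::memcpy(&op_params[1], &max_in, sizeof(float));

        float min_out, max_out;                   // read them back the same way
        std::memcpy(&min_out, &op_params[0], sizeof(float));
        std::memcpy(&max_out, &op_params[1], sizeof(float));
        assert(min_out == min_in && max_out == max_in);
        return 0;
    }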
@@ -7013,7 +7189,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
  }

  static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
- GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+ GGML_ASSERT(!ggml_is_transposed(src0));
+ GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -7023,11 +7200,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

- const int64_t ne12 = src1->ne[2];
-
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -7046,27 +7223,210 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

+ __global__ void k_compute_batched_ptrs(
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ void ** ptrs,
+ int ne12, int ne13,
+ int ne23,
+ int nb02, int nb03,
+ int nb12, int nb13,
+ int nb2, int nb3,
+ int r2, int r3) {
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (i13 >= ne13 || i12 >= ne12) {
+ return;
+ }
+
+ int i03 = i13 / r3;
+ int i02 = i12 / r2;
+
+ ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ }
+
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(!ggml_is_transposed(src0));
+ GGML_ASSERT(!ggml_is_transposed(src1));
+
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t nb01 = src0->nb[1];
+ const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+ const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+ const int64_t ne1 = ggml_nelements(src1);
+ const int64_t ne = ggml_nelements(dst);
+
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ void * src0_ddq = src0_extra->data_device[g_main_device];
+ half * src0_as_f16 = (half *) src0_ddq;
+
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+ // convert src1 to fp16
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+
+ size_t src1_as = 0;
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+ size_t dst_as = 0;
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+ GGML_ASSERT(ne12 % ne02 == 0);
+ GGML_ASSERT(ne13 % ne03 == 0);
+
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ #if 0
+ // use cublasGemmEx
+ {
+ for (int i13 = 0; i13 < ne13; ++i13) {
+ for (int i12 = 0; i12 < ne12; ++i12) {
+ int i03 = i13 / r3;
+ int i02 = i12 / r2;
+
+ CUBLAS_CHECK(
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ }
+ }
+ }
+ #else
+ if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+ // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+ // use cublasGemmStridedBatchedEx
+ CUBLAS_CHECK(
+ cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+ &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+ ne12*ne13,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ } else {
+ // use cublasGemmBatchedEx
+ const int ne23 = ne12*ne13;
+
+ void ** ptrs_as = nullptr;
+ size_t ptrs_s = 0;
+ ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+ dim3 block_dims(ne13, ne12);
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+ src0_as_f16, src1_as_f16, dst_f16,
+ ptrs_as,
+ ne12, ne13,
+ ne23,
+ nb02, nb03,
+ nb12, nb13,
+ dst->nb[2], dst->nb[3],
+ r2, r3);
+ CUDA_CHECK(cudaGetLastError());
+
+ CUBLAS_CHECK(
+ cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+ ne23,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+ ggml_cuda_pool_free(ptrs_as, ptrs_s);
+ }
+ #endif
+
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
+ ggml_cuda_pool_free(dst_f16, dst_as);
+ }
+
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+ const bool all_on_device =
+ (src0->backend == GGML_BACKEND_GPU) &&
+ (src1->backend == GGML_BACKEND_GPU) &&
+ ( dst->backend == GGML_BACKEND_GPU);

  int64_t min_compute_capability = INT_MAX;
  for (int64_t id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
  min_compute_capability = g_compute_capabilities[id];
  }
  }

- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ #ifdef CUDA_USE_TENSOR_CORES
+ const bool use_tensor_cores = true;
+ #else
+ const bool use_tensor_cores = false;
+ #endif
+
+ // debug helpers
+ //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+ //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+ //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+ //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+ //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+ //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+ if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ // KQ single-batch
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+ } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+ } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ // KQ + KQV multi-batch
+ ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
  #ifdef GGML_CUDA_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
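The new batched path chooses between the two cuBLAS batch APIs: when there is no broadcast and both inputs are laid out contiguously across dims 2 and 3, a single strided-batched GEMM covers all ne12*ne13 matrices; otherwise per-matrix pointer tables are built on the GPU by k_compute_batched_ptrs and passed to cublasGemmBatchedEx. Condensed into one predicate (names shortened, illustrative only, not the literal code):

    #include <cstdint>

    // r2 = ne12/ne02 and r3 = ne13/ne03 are the broadcast factors of src1 over src0
    static bool can_use_strided_batched(int64_t r2, int64_t r3,
                                        int64_t src0_nb2, int64_t src0_ne2, int64_t src0_nb3,
                                        int64_t src1_nb2, int64_t src1_ne2, int64_t src1_nb3) {
        return r2 == 1 && r3 == 1 &&
               src0_nb2 * src0_ne2 == src0_nb3 &&   // src0 contiguous across dims 2/3
               src1_nb2 * src1_ne2 == src1_nb3;     // src1 contiguous across dims 2/3
    }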
@@ -7079,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
  }
  } else {
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+ // when tensor cores are available, use them for large batch size
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+ use_mul_mat_q = false;
+ }
+
+ if (use_mul_mat_q) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
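The quantized mat-mul choice above, which replaces the removed g_mul_mat_q global, can be read as one small predicate. This is a paraphrase for illustration, not the literal code; the constants are the ones defined earlier in this file:

    static bool choose_mul_mat_q(bool src0_is_quantized, int min_compute_capability,
                                 int64_t src1_ne1, bool use_tensor_cores) {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && src0_is_quantized;
        // with tensor cores available, large batches go to cuBLAS instead (llama.cpp PR #3776)
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1_ne1 > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }
        return use_mul_mat_q;
    }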
@@ -7433,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
- g_mul_mat_q = mul_mat_q;
- }
-
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
  // it still won't always work as expected, but it's better than nothing