llama_cpp 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,8 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasCreate hipblasCreate
  #define cublasGemmEx hipblasGemmEx
+ #define cublasGemmBatchedEx hipblasGemmBatchedEx
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
  #define cublasHandle_t hipblasHandle_t
  #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
  #define cublasSetStream hipblasSetStream
@@ -85,6 +87,24 @@
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+ // - 7B quantum model: +100-200 MB
+ // - 13B quantum model: +200-400 MB
+ //
+ //#define GGML_CUDA_FORCE_MMQ
+
+ // TODO: improve this to be correct for more hardware
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+ // probably other such cases, and not sure what happens on AMD hardware
+ #if !defined(GGML_CUDA_FORCE_MMQ)
+ #define CUDA_USE_TENSOR_CORES
+ #endif
+
+ // max batch size to use MMQ kernels when tensor cores are available
+ #define MMQ_MAX_BATCH_SIZE 32
+
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

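
For orientation, a minimal sketch (not part of the diff) of how these compile-time switches interact. Defining GGML_CUDA_FORCE_MMQ at build time, e.g. passing -DGGML_CUDA_FORCE_MMQ to the compiler, disables CUDA_USE_TENSOR_CORES; otherwise MMQ is kept only for small batches, as the dispatch change near the end of this diff shows. The helper name below is hypothetical:

    // sketch: which matrix-multiplication path a quantized mat-mul would take,
    // ignoring compute-capability checks for brevity
    #if !defined(GGML_CUDA_FORCE_MMQ)
    #define CUDA_USE_TENSOR_CORES
    #endif

    static const char * pick_mul_mat_backend(int batch_size) {
    #if defined(CUDA_USE_TENSOR_CORES)
        // large batches go through cuBLAS F16 tensor cores, small ones stay on MMQ
        return batch_size > 32 /* MMQ_MAX_BATCH_SIZE */ ? "cuBLAS (tensor cores)" : "MMQ";
    #else
        return "MMQ";
    #endif
    }
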
@@ -468,7 +488,6 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 0; // disabled by default
@@ -494,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
  dst[i] = __hadd(x[i], __float2half(y[i]));
  }

+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = __half2float(x[i]) + y[i];
+ }
+
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

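
The matching host-side launcher, add_f16_f32_f32_cuda, appears further down in this diff; for quick reference, the element-wise launch pattern is a plain ceiling division over the k elements (a sketch that reuses the file's CUDA_ADD_BLOCK_SIZE constant and the kernel above):

    // sketch: one thread per element, grid size rounded up
    static void launch_add_f16_f32_f32(const half * x, const float * y, float * dst, int k, cudaStream_t stream) {
        const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; // ceil(k / block size)
        add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
    }
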
@@ -3552,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
  #define MMQ_X_Q4_0_RDNA1 64
  #define MMQ_Y_Q4_0_RDNA1 64
  #define NWARPS_Q4_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_0_AMPERE 4
+ #define MMQ_Y_Q4_0_AMPERE 32
+ #define NWARPS_Q4_0_AMPERE 4
+ #else
  #define MMQ_X_Q4_0_AMPERE 64
  #define MMQ_Y_Q4_0_AMPERE 128
  #define NWARPS_Q4_0_AMPERE 4
+ #endif
  #define MMQ_X_Q4_0_PASCAL 64
  #define MMQ_Y_Q4_0_PASCAL 64
  #define NWARPS_Q4_0_PASCAL 8
@@ -3613,9 +3647,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_1_RDNA1 64
  #define MMQ_Y_Q4_1_RDNA1 64
  #define NWARPS_Q4_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_1_AMPERE 4
+ #define MMQ_Y_Q4_1_AMPERE 32
+ #define NWARPS_Q4_1_AMPERE 4
+ #else
  #define MMQ_X_Q4_1_AMPERE 64
  #define MMQ_Y_Q4_1_AMPERE 128
  #define NWARPS_Q4_1_AMPERE 4
+ #endif
  #define MMQ_X_Q4_1_PASCAL 64
  #define MMQ_Y_Q4_1_PASCAL 64
  #define NWARPS_Q4_1_PASCAL 8
@@ -3676,9 +3716,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_0_RDNA1 64
  #define MMQ_Y_Q5_0_RDNA1 64
  #define NWARPS_Q5_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_0_AMPERE 4
+ #define MMQ_Y_Q5_0_AMPERE 32
+ #define NWARPS_Q5_0_AMPERE 4
+ #else
  #define MMQ_X_Q5_0_AMPERE 128
  #define MMQ_Y_Q5_0_AMPERE 64
  #define NWARPS_Q5_0_AMPERE 4
+ #endif
  #define MMQ_X_Q5_0_PASCAL 64
  #define MMQ_Y_Q5_0_PASCAL 64
  #define NWARPS_Q5_0_PASCAL 8
@@ -3737,9 +3783,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_1_RDNA1 64
  #define MMQ_Y_Q5_1_RDNA1 64
  #define NWARPS_Q5_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_1_AMPERE 4
+ #define MMQ_Y_Q5_1_AMPERE 32
+ #define NWARPS_Q5_1_AMPERE 4
+ #else
  #define MMQ_X_Q5_1_AMPERE 128
  #define MMQ_Y_Q5_1_AMPERE 64
  #define NWARPS_Q5_1_AMPERE 4
+ #endif
  #define MMQ_X_Q5_1_PASCAL 64
  #define MMQ_Y_Q5_1_PASCAL 64
  #define NWARPS_Q5_1_PASCAL 8
@@ -3798,9 +3850,15 @@ mul_mat_q5_1(
  #define MMQ_X_Q8_0_RDNA1 64
  #define MMQ_Y_Q8_0_RDNA1 64
  #define NWARPS_Q8_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q8_0_AMPERE 4
+ #define MMQ_Y_Q8_0_AMPERE 32
+ #define NWARPS_Q8_0_AMPERE 4
+ #else
  #define MMQ_X_Q8_0_AMPERE 128
  #define MMQ_Y_Q8_0_AMPERE 64
  #define NWARPS_Q8_0_AMPERE 4
+ #endif
  #define MMQ_X_Q8_0_PASCAL 64
  #define MMQ_Y_Q8_0_PASCAL 64
  #define NWARPS_Q8_0_PASCAL 8
@@ -3859,9 +3917,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q2_K_RDNA1 128
  #define MMQ_Y_Q2_K_RDNA1 32
  #define NWARPS_Q2_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q2_K_AMPERE 4
+ #define MMQ_Y_Q2_K_AMPERE 32
+ #define NWARPS_Q2_K_AMPERE 4
+ #else
  #define MMQ_X_Q2_K_AMPERE 64
  #define MMQ_Y_Q2_K_AMPERE 128
  #define NWARPS_Q2_K_AMPERE 4
+ #endif
  #define MMQ_X_Q2_K_PASCAL 64
  #define MMQ_Y_Q2_K_PASCAL 64
  #define NWARPS_Q2_K_PASCAL 8
@@ -3920,9 +3984,15 @@ mul_mat_q2_K(
  #define MMQ_X_Q3_K_RDNA1 32
  #define MMQ_Y_Q3_K_RDNA1 128
  #define NWARPS_Q3_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q3_K_AMPERE 4
+ #define MMQ_Y_Q3_K_AMPERE 32
+ #define NWARPS_Q3_K_AMPERE 4
+ #else
  #define MMQ_X_Q3_K_AMPERE 128
  #define MMQ_Y_Q3_K_AMPERE 128
  #define NWARPS_Q3_K_AMPERE 4
+ #endif
  #define MMQ_X_Q3_K_PASCAL 64
  #define MMQ_Y_Q3_K_PASCAL 64
  #define NWARPS_Q3_K_PASCAL 8
@@ -3983,9 +4053,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_K_RDNA1 32
  #define MMQ_Y_Q4_K_RDNA1 64
  #define NWARPS_Q4_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_K_AMPERE 4
+ #define MMQ_Y_Q4_K_AMPERE 32
+ #define NWARPS_Q4_K_AMPERE 4
+ #else
  #define MMQ_X_Q4_K_AMPERE 64
  #define MMQ_Y_Q4_K_AMPERE 128
  #define NWARPS_Q4_K_AMPERE 4
+ #endif
  #define MMQ_X_Q4_K_PASCAL 64
  #define MMQ_Y_Q4_K_PASCAL 64
  #define NWARPS_Q4_K_PASCAL 8
@@ -4046,9 +4122,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_K_RDNA1 32
  #define MMQ_Y_Q5_K_RDNA1 64
  #define NWARPS_Q5_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_K_AMPERE 4
+ #define MMQ_Y_Q5_K_AMPERE 32
+ #define NWARPS_Q5_K_AMPERE 4
+ #else
  #define MMQ_X_Q5_K_AMPERE 64
  #define MMQ_Y_Q5_K_AMPERE 128
  #define NWARPS_Q5_K_AMPERE 4
+ #endif
  #define MMQ_X_Q5_K_PASCAL 64
  #define MMQ_Y_Q5_K_PASCAL 64
  #define NWARPS_Q5_K_PASCAL 8
@@ -4107,9 +4189,15 @@ mul_mat_q5_K(
  #define MMQ_X_Q6_K_RDNA1 32
  #define MMQ_Y_Q6_K_RDNA1 64
  #define NWARPS_Q6_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q6_K_AMPERE 4
+ #define MMQ_Y_Q6_K_AMPERE 32
+ #define NWARPS_Q6_K_AMPERE 4
+ #else
  #define MMQ_X_Q6_K_AMPERE 64
  #define MMQ_Y_Q6_K_AMPERE 64
  #define NWARPS_Q6_K_AMPERE 4
+ #endif
  #define MMQ_X_Q6_K_PASCAL 64
  #define MMQ_Y_Q6_K_PASCAL 64
  #define NWARPS_Q6_K_PASCAL 8
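
A note on the ten near-identical hunks above: MMQ_X and MMQ_Y set the output tile a thread block works on and NWARPS the number of warps assigned to it. With CUDA_USE_TENSOR_CORES defined, MMQ only ever serves batches up to MMQ_MAX_BATCH_SIZE (see the dispatch change at the end of this diff), so the Ampere tiles shrink. Rough numbers, assuming 32-thread warps; the exact axis mapping inside mul_mat_q is not visible in this diff:

    // per-block tile for Q4_0 on Ampere
    //   tensor-core build:  MMQ_X*MMQ_Y =  4*32  =  128 values, NWARPS*32 = 128 threads
    //   fallback build:     MMQ_X*MMQ_Y = 64*128 = 8192 values, NWARPS*32 = 128 threads
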
@@ -4326,13 +4414,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous

  const half * x = (const half *) vx;

- const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
- const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
  const int channel_x = channel / channel_x_divisor;

- const int nrows_y = ncols_x;
+ const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
- const int row_dst = row_x;
+ const int row_dst = row_x;

  const int idst = channel*nrows_dst + row_dst;

@@ -4345,13 +4433,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  break;
  }

- const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
- const float xi = __half2float(x[ix]);
-
  const int row_y = col_x;

+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
  const int iy = channel*nrows_y + row_y;

+ const float xi = __half2float(x[ix]);
+
  tmp += xi * y[iy];
  }

@@ -4405,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }

- // rope == RoPE == rotary positional embedding
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
+ return 1.0f - min(1.0f, max(0.0f, y));
+ }
+
+ struct rope_corr_dims {
+ float v[4];
+ };
+
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+ static __device__ void rope_yarn(
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+ float * cos_theta, float * sin_theta
+ ) {
+ // Get n-d rotational scaling corrected for extrapolation
+ float theta_interp = freq_scale * theta_extrap;
+ float theta = theta_interp;
+ if (ext_factor != 0.0f) {
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+ // Get n-d magnitude scaling corrected for interpolation
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+ }
+ *cos_theta = cosf(theta) * mscale;
+ *sin_theta = sinf(theta) * mscale;
+ }

+ // rope == RoPE == rotary positional embedding
  template<typename T, bool has_pos>
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
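
To make the YaRN helpers above easier to follow, here is a plain host-side transcription (a sketch for illustration only; it mirrors the device code line by line and is not part of the package):

    #include <algorithm>
    #include <cmath>

    // ramp from 1 to 0 across the correction range [low, high) of rotary dimensions
    static float rope_yarn_ramp_host(float low, float high, int i0) {
        const float y = (i0 / 2 - low) / std::max(0.001f, high - low);
        return 1.0f - std::min(1.0f, std::max(0.0f, y));
    }

    // blend the interpolated angle (freq_scale * theta) with the extrapolated one;
    // ext_factor == 0 reduces to plain frequency-scaled RoPE
    static void rope_yarn_host(float theta_extrap, float freq_scale, const float corr_dims[2], int i0,
                               float ext_factor, float mscale, float * cos_theta, float * sin_theta) {
        float theta = freq_scale * theta_extrap;
        if (ext_factor != 0.0f) {
            const float ramp_mix = rope_yarn_ramp_host(corr_dims[0], corr_dims[1], i0) * ext_factor;
            theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
            // magnitude scaling corrected for interpolation
            mscale *= 1.0f + 0.1f * std::log(1.0f / freq_scale);
        }
        *cos_theta = std::cos(theta) * mscale;
        *sin_theta = std::sin(theta) * mscale;
    }
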
@@ -4421,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  const int i2 = row/p_delta_rows;

  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
  const float x1 = x[i + 1];
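
The rewritten angle is the same quantity the old code produced, with freq_scale now applied inside rope_yarn. A small sketch of the algebra, assuming (as on this path) that the old theta_scale was powf(freq_base, -2.0f/ncols) and noting that col is always even here:

    // old: theta = (p*freq_scale) * theta_scale^(col/2)
    //            = p*freq_scale * freq_base^(-2*(col/2)/ncols)
    //            = p*freq_scale * freq_base^(-col/ncols)
    // new: theta_base = p * freq_base^(-col/ncols)
    //      with ext_factor == 0, rope_yarn returns cos/sin of freq_scale*theta_base,
    //      i.e. exactly the old theta; a non-zero ext_factor adds the YaRN blending on top.
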
@@ -4434,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  }

  template<typename T, bool has_pos>
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope_neox(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4446,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  const int i = row*ncols + col/2;
  const int i2 = row/p_delta_rows;

+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+ const float cur_rot = -float(col)/ncols;
+
  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, cur_rot);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
  const float x1 = x[i + ncols/2];
@@ -4459,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
+ static __global__ void rope_glm_f32(
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ int n_ctx
+ ) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4472,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;
  const int i2 = row/p_delta_rows;

- const float col_theta_scale = powf(theta_scale, col);
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
  // FIXME: this is likely wrong
  const int p = pos != nullptr ? pos[i2] : 0;

@@ -4614,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
  }

+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ }
+
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5491,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
  }

  template<typename T>
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }

  template<typename T>
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_neox_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ static void rope_glm_f32_cuda(
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, int n_ctx, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 4 == 0);
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
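
The rope launch geometry is unchanged by this diff but worth spelling out once: each thread rotates one pair of values, i.e. two columns, hence the 2* in the ceiling division. A worked sketch, assuming CUDA_ROPE_BLOCK_SIZE is 256 as defined elsewhere in this file:

    // ncols = 128 (head size), nrows = n_tokens * n_heads
    //   num_blocks_x = ceil(128 / (2*256)) = 1
    //   grid = (nrows, 1, 1), block = (1, 256, 1)
    //   thread (0, ty) of each row handles columns 2*ty and 2*ty + 1, guarded by col >= ncols
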
@@ -5661,11 +5805,21 @@ void ggml_init_cublas() {
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
+ #if defined(GGML_CUDA_FORCE_MMQ)
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ #endif
+ #if defined(CUDA_USE_TENSOR_CORES)
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ #endif
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
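
On a default build (GGML_CUDA_FORCE_MMQ not defined) the extra startup lines produced by the fprintf calls above look roughly like the following; the device name and compute capability are illustrative placeholders, not real output:

    ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no
    ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
    ggml_init_cublas: found 1 CUDA devices:
     Device 0: <device name>, compute capability 8.6
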
@@ -5675,15 +5829,15 @@ void ggml_init_cublas() {
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }
- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
  }

- for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int id = 0; id < g_device_count; ++id) {
  CUDA_CHECK(ggml_cuda_set_device(id));

  // create cuda streams
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ for (int is = 0; is < MAX_STREAMS; ++is) {
  CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
  }

@@ -5907,7 +6061,10 @@ inline void ggml_cuda_op_add(
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
  } else {
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
  GGML_ASSERT(false);
  }

@@ -6252,16 +6409,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
  const int64_t src1_padded_row_size, const cudaStream_t & stream) {

- GGML_ASSERT(src0_dd_i != nullptr);
+ GGML_ASSERT(src0_dd_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_dd_i != nullptr);
-
+ GGML_ASSERT(dst_dd_i != nullptr);

  const int64_t ne00 = src0->ne[0];
-
  const int64_t ne10 = src1->ne[0];

  const int64_t ne0 = dst->ne[0];
+
  const int64_t row_diff = row_high - row_low;

  int id;
@@ -6346,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  row_diff, src1_ncols, ne10,
  &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
+ src1_ddf_i, ne10,
  &beta, dst_dd_i, ldc));

  if (src0_as != 0) {
@@ -6372,17 +6528,20 @@ inline void ggml_cuda_op_rope(
  const int64_t ne2 = dst->ne[2];
  const int64_t nrows = ggml_nrows(src0);

- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- // RoPE alteration for extended context
-
- float freq_base, freq_scale;
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ //const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

- const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ // RoPE alteration for extended context
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

  const int32_t * pos = nullptr;
  if ((mode & 1) == 0) {
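
Summarizing the new parameter packing read above (the layout follows directly from the indices in the memcpy calls; the float values are stored bit-for-bit in int32 slots, which is why memcpy is used instead of a cast):

    // dst->op_params slots consumed by ggml_cuda_op_rope after this change
    //   [0] n_past (unused here)  [1] n_dims      [2] mode        [3] n_ctx
    //   [4] n_orig_ctx            [5] freq_base   [6] freq_scale  [7] ext_factor
    //   [8] attn_factor           [9] beta_fast   [10] beta_slow
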
@@ -6394,24 +6553,39 @@ inline void ggml_cuda_op_rope(
  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;

+ rope_corr_dims corr_dims;
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
  // compute
  if (is_glm) {
  GGML_ASSERT(false);
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
  } else {
  if (src0->type == GGML_TYPE_F32) {
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
@@ -6522,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const float min = ((float *) dst->op_params)[0];
- const float max = ((float *) dst->op_params)[1];
+ float min;
+ float max;
+ memcpy(&min, dst->op_params, sizeof(float));
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
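
The clamp change swaps a pointer-cast read of op_params for memcpy. Reading a float through a cast of int32-backed storage is type punning and technically undefined behaviour in C++; a memcpy of sizeof(float) bytes is the well-defined equivalent and compiles to the same load. A minimal self-contained sketch of the pattern:

    #include <cstdint>
    #include <cstring>

    // sketch: well-defined float read from int32-backed parameter storage
    static float read_op_param_f32(const int32_t * op_params, int idx) {
        float v;
        std::memcpy(&v, op_params + idx, sizeof(float)); // instead of ((const float *) op_params)[idx]
        return v;
    }
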
@@ -7013,7 +7189,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
  }

  static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
- GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
+ GGML_ASSERT(!ggml_is_transposed(src0));
+ GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -7023,11 +7200,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

- const int64_t ne12 = src1->ne[2];
-
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -7046,27 +7223,210 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

+ __global__ void k_compute_batched_ptrs(
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ void ** ptrs,
+ int ne12, int ne13,
+ int ne23,
+ int nb02, int nb03,
+ int nb12, int nb13,
+ int nb2, int nb3,
+ int r2, int r3) {
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (i13 >= ne13 || i12 >= ne12) {
+ return;
+ }
+
+ int i03 = i13 / r3;
+ int i02 = i12 / r2;
+
+ ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ }
+
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(!ggml_is_transposed(src0));
+ GGML_ASSERT(!ggml_is_transposed(src1));
+
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t nb01 = src0->nb[1];
+ const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+ const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+ const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+ const int64_t ne1 = ggml_nelements(src1);
+ const int64_t ne = ggml_nelements(dst);
+
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ void * src0_ddq = src0_extra->data_device[g_main_device];
+ half * src0_as_f16 = (half *) src0_ddq;
+
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+ // convert src1 to fp16
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+
+ size_t src1_as = 0;
+ half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+ to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+ size_t dst_as = 0;
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+ GGML_ASSERT(ne12 % ne02 == 0);
+ GGML_ASSERT(ne13 % ne03 == 0);
+
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ #if 0
+ // use cublasGemmEx
+ {
+ for (int i13 = 0; i13 < ne13; ++i13) {
+ for (int i12 = 0; i12 < ne12; ++i12) {
+ int i03 = i13 / r3;
+ int i02 = i12 / r2;
+
+ CUBLAS_CHECK(
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ }
+ }
+ }
+ #else
+ if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+ // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+ // use cublasGemmStridedBatchedEx
+ CUBLAS_CHECK(
+ cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+ &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+ ne12*ne13,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ } else {
+ // use cublasGemmBatchedEx
+ const int ne23 = ne12*ne13;
+
+ void ** ptrs_as = nullptr;
+ size_t ptrs_s = 0;
+ ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+ dim3 block_dims(ne13, ne12);
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+ src0_as_f16, src1_as_f16, dst_f16,
+ ptrs_as,
+ ne12, ne13,
+ ne23,
+ nb02, nb03,
+ nb12, nb13,
+ dst->nb[2], dst->nb[3],
+ r2, r3);
+ CUDA_CHECK(cudaGetLastError());
+
+ CUBLAS_CHECK(
+ cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ ne01, ne11, ne10,
+ &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+ ne23,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+ ggml_cuda_pool_free(ptrs_as, ptrs_s);
+ }
+ #endif
+
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
+ ggml_cuda_pool_free(dst_f16, dst_as);
+ }
+
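
Two details of the new batched path that are easy to miss: cublasGemmBatchedEx consumes arrays of per-matrix pointers, which k_compute_batched_ptrs fills on the device with one thread per (i12, i13) batch index, and the src1/dst offsets are divided by 2 because the nb[] values are byte strides of the original F32 tensors while the pointers being built address the freshly converted F16 copies. As a sketch:

    // byte offset of batch (i12, i13) inside the F16 copy of an F32 tensor
    //   F32 tensor: offset_f32 = i12*nb[2] + i13*nb[3]   (nb[] are byte strides of F32 data)
    //   F16 copy:   offset_f16 = offset_f32 / 2          (same element count, half the bytes)
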
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+ const bool all_on_device =
+ (src0->backend == GGML_BACKEND_GPU) &&
+ (src1->backend == GGML_BACKEND_GPU) &&
+ ( dst->backend == GGML_BACKEND_GPU);

  int64_t min_compute_capability = INT_MAX;
  for (int64_t id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
  min_compute_capability = g_compute_capabilities[id];
  }
  }

- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ #ifdef CUDA_USE_TENSOR_CORES
+ const bool use_tensor_cores = true;
+ #else
+ const bool use_tensor_cores = false;
+ #endif
+
+ // debug helpers
+ //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+ //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+ //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+ //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+ //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+ //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+ if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ // KQ single-batch
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+ } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+ } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ // KQ + KQV multi-batch
+ ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
  #ifdef GGML_CUDA_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
@@ -7079,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
  }
  } else {
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+ // when tensor cores are available, use them for large batch size
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+ use_mul_mat_q = false;
+ }
+
+ if (use_mul_mat_q) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
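
Putting the pieces of ggml_cuda_mul_mat together, the dispatch after this change can be summarized as follows (a simplified sketch; it omits the multi-GPU split path and some shape checks):

    // 1. on GPU, no tensor cores, F16 src0, permuted,       batch == 1 -> ggml_cuda_mul_mat_vec_p021 (KQ)
    // 2. on GPU, no tensor cores, F16 src0, non-contiguous, batch == 1 -> ggml_cuda_mul_mat_vec_nc   (KQV)
    // 3. on GPU, tensor cores, F16 src0 x F32 src1, not transposed     -> ggml_cuda_mul_mat_mat_batched_cublas
    // 4. F32 src0                                                      -> cuBLAS SGEMM path
    // 5. quantized or F16 src0 with batch == 1                         -> *_mul_mat_vec kernels
    // 6. otherwise quantized                                           -> MMQ, unless tensor cores are in use
    //    and batch > MMQ_MAX_BATCH_SIZE, in which case cuBLAS is used
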
@@ -7433,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
- g_mul_mat_q = mul_mat_q;
- }
-
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
  // it still won't always work as expected, but it's better than nothing