llama_cpp 0.9.0 → 0.9.1

@@ -87,6 +87,24 @@
87
87
  #define CC_OFFSET_AMD 1000000
88
88
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
89
89
 
90
+ // define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
91
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores, which are very performant
92
+ // for large computational tasks. the drawback is that this requires some extra VRAM:
93
+ // - 7B quantum model: +100-200 MB
94
+ // - 13B quantum model: +200-400 MB
95
+ //
96
+ //#define GGML_CUDA_FORCE_MMQ
97
+
98
+ // TODO: improve this to be correct for more hardware
99
+ // for example, it currently fails for the GeForce GTX 1660, which is TURING arch (> VOLTA) but does not have tensor cores
100
+ // there are probably other such cases, and it is not clear what happens on AMD hardware
101
+ #if !defined(GGML_CUDA_FORCE_MMQ)
102
+ #define CUDA_USE_TENSOR_CORES
103
+ #endif
104
+
105
+ // max batch size to use MMQ kernels when tensor cores are available
106
+ #define MMQ_MAX_BATCH_SIZE 32
107
+
90
108
  #if defined(GGML_USE_HIPBLAS)
91
109
  #define __CUDA_ARCH__ 1300
92
110
 
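The block above is the core CUDA change in this release: unless GGML_CUDA_FORCE_MMQ is defined, CUDA_USE_TENSOR_CORES is enabled and the F16 cuBLAS path is preferred over the quantized MMQ kernels for large matrix multiplications, at the cost of the extra VRAM noted in the comment; MMQ remains in use for batches of up to MMQ_MAX_BATCH_SIZE rows. A simplified sketch of how the two macros end up being used (names are from this diff; the sketch ignores the compute-capability checks that the real dispatch code further down also applies):

    // compile time: tensor cores are on unless MMQ is forced
    #if !defined(GGML_CUDA_FORCE_MMQ)
    #define CUDA_USE_TENSOR_CORES
    #endif

    // run time (simplified): keep MMQ only for small batches when tensor cores are in use
    static bool use_mmq_for(int batch_size /* src1->ne[1] */, bool src0_is_quantized) {
    #ifdef CUDA_USE_TENSOR_CORES
        return src0_is_quantized && batch_size <= MMQ_MAX_BATCH_SIZE;
    #else
        return src0_is_quantized;
    #endif
    }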
@@ -470,7 +488,6 @@ static int g_device_count = -1;
470
488
  static int g_main_device = 0;
471
489
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
472
490
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
473
- static bool g_mul_mat_q = true;
474
491
 
475
492
  static void * g_scratch_buffer = nullptr;
476
493
  static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
496
513
  dst[i] = __hadd(x[i], __float2half(y[i]));
497
514
  }
498
515
 
516
+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
517
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
518
+
519
+ if (i >= k) {
520
+ return;
521
+ }
522
+ dst[i] = __half2float(x[i]) + y[i];
523
+ }
524
+
499
525
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
500
526
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
501
527
 
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
3554
3580
  #define MMQ_X_Q4_0_RDNA1 64
3555
3581
  #define MMQ_Y_Q4_0_RDNA1 64
3556
3582
  #define NWARPS_Q4_0_RDNA1 8
3583
+ #if defined(CUDA_USE_TENSOR_CORES)
3584
+ #define MMQ_X_Q4_0_AMPERE 4
3585
+ #define MMQ_Y_Q4_0_AMPERE 32
3586
+ #define NWARPS_Q4_0_AMPERE 4
3587
+ #else
3557
3588
  #define MMQ_X_Q4_0_AMPERE 64
3558
3589
  #define MMQ_Y_Q4_0_AMPERE 128
3559
3590
  #define NWARPS_Q4_0_AMPERE 4
3591
+ #endif
3560
3592
  #define MMQ_X_Q4_0_PASCAL 64
3561
3593
  #define MMQ_Y_Q4_0_PASCAL 64
3562
3594
  #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
3615
3647
  #define MMQ_X_Q4_1_RDNA1 64
3616
3648
  #define MMQ_Y_Q4_1_RDNA1 64
3617
3649
  #define NWARPS_Q4_1_RDNA1 8
3650
+ #if defined(CUDA_USE_TENSOR_CORES)
3651
+ #define MMQ_X_Q4_1_AMPERE 4
3652
+ #define MMQ_Y_Q4_1_AMPERE 32
3653
+ #define NWARPS_Q4_1_AMPERE 4
3654
+ #else
3618
3655
  #define MMQ_X_Q4_1_AMPERE 64
3619
3656
  #define MMQ_Y_Q4_1_AMPERE 128
3620
3657
  #define NWARPS_Q4_1_AMPERE 4
3658
+ #endif
3621
3659
  #define MMQ_X_Q4_1_PASCAL 64
3622
3660
  #define MMQ_Y_Q4_1_PASCAL 64
3623
3661
  #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
3678
3716
  #define MMQ_X_Q5_0_RDNA1 64
3679
3717
  #define MMQ_Y_Q5_0_RDNA1 64
3680
3718
  #define NWARPS_Q5_0_RDNA1 8
3719
+ #if defined(CUDA_USE_TENSOR_CORES)
3720
+ #define MMQ_X_Q5_0_AMPERE 4
3721
+ #define MMQ_Y_Q5_0_AMPERE 32
3722
+ #define NWARPS_Q5_0_AMPERE 4
3723
+ #else
3681
3724
  #define MMQ_X_Q5_0_AMPERE 128
3682
3725
  #define MMQ_Y_Q5_0_AMPERE 64
3683
3726
  #define NWARPS_Q5_0_AMPERE 4
3727
+ #endif
3684
3728
  #define MMQ_X_Q5_0_PASCAL 64
3685
3729
  #define MMQ_Y_Q5_0_PASCAL 64
3686
3730
  #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
3739
3783
  #define MMQ_X_Q5_1_RDNA1 64
3740
3784
  #define MMQ_Y_Q5_1_RDNA1 64
3741
3785
  #define NWARPS_Q5_1_RDNA1 8
3786
+ #if defined(CUDA_USE_TENSOR_CORES)
3787
+ #define MMQ_X_Q5_1_AMPERE 4
3788
+ #define MMQ_Y_Q5_1_AMPERE 32
3789
+ #define NWARPS_Q5_1_AMPERE 4
3790
+ #else
3742
3791
  #define MMQ_X_Q5_1_AMPERE 128
3743
3792
  #define MMQ_Y_Q5_1_AMPERE 64
3744
3793
  #define NWARPS_Q5_1_AMPERE 4
3794
+ #endif
3745
3795
  #define MMQ_X_Q5_1_PASCAL 64
3746
3796
  #define MMQ_Y_Q5_1_PASCAL 64
3747
3797
  #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
3800
3850
  #define MMQ_X_Q8_0_RDNA1 64
3801
3851
  #define MMQ_Y_Q8_0_RDNA1 64
3802
3852
  #define NWARPS_Q8_0_RDNA1 8
3853
+ #if defined(CUDA_USE_TENSOR_CORES)
3854
+ #define MMQ_X_Q8_0_AMPERE 4
3855
+ #define MMQ_Y_Q8_0_AMPERE 32
3856
+ #define NWARPS_Q8_0_AMPERE 4
3857
+ #else
3803
3858
  #define MMQ_X_Q8_0_AMPERE 128
3804
3859
  #define MMQ_Y_Q8_0_AMPERE 64
3805
3860
  #define NWARPS_Q8_0_AMPERE 4
3861
+ #endif
3806
3862
  #define MMQ_X_Q8_0_PASCAL 64
3807
3863
  #define MMQ_Y_Q8_0_PASCAL 64
3808
3864
  #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
3861
3917
  #define MMQ_X_Q2_K_RDNA1 128
3862
3918
  #define MMQ_Y_Q2_K_RDNA1 32
3863
3919
  #define NWARPS_Q2_K_RDNA1 8
3920
+ #if defined(CUDA_USE_TENSOR_CORES)
3921
+ #define MMQ_X_Q2_K_AMPERE 4
3922
+ #define MMQ_Y_Q2_K_AMPERE 32
3923
+ #define NWARPS_Q2_K_AMPERE 4
3924
+ #else
3864
3925
  #define MMQ_X_Q2_K_AMPERE 64
3865
3926
  #define MMQ_Y_Q2_K_AMPERE 128
3866
3927
  #define NWARPS_Q2_K_AMPERE 4
3928
+ #endif
3867
3929
  #define MMQ_X_Q2_K_PASCAL 64
3868
3930
  #define MMQ_Y_Q2_K_PASCAL 64
3869
3931
  #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
3922
3984
  #define MMQ_X_Q3_K_RDNA1 32
3923
3985
  #define MMQ_Y_Q3_K_RDNA1 128
3924
3986
  #define NWARPS_Q3_K_RDNA1 8
3987
+ #if defined(CUDA_USE_TENSOR_CORES)
3988
+ #define MMQ_X_Q3_K_AMPERE 4
3989
+ #define MMQ_Y_Q3_K_AMPERE 32
3990
+ #define NWARPS_Q3_K_AMPERE 4
3991
+ #else
3925
3992
  #define MMQ_X_Q3_K_AMPERE 128
3926
3993
  #define MMQ_Y_Q3_K_AMPERE 128
3927
3994
  #define NWARPS_Q3_K_AMPERE 4
3995
+ #endif
3928
3996
  #define MMQ_X_Q3_K_PASCAL 64
3929
3997
  #define MMQ_Y_Q3_K_PASCAL 64
3930
3998
  #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
3985
4053
  #define MMQ_X_Q4_K_RDNA1 32
3986
4054
  #define MMQ_Y_Q4_K_RDNA1 64
3987
4055
  #define NWARPS_Q4_K_RDNA1 8
4056
+ #if defined(CUDA_USE_TENSOR_CORES)
4057
+ #define MMQ_X_Q4_K_AMPERE 4
4058
+ #define MMQ_Y_Q4_K_AMPERE 32
4059
+ #define NWARPS_Q4_K_AMPERE 4
4060
+ #else
3988
4061
  #define MMQ_X_Q4_K_AMPERE 64
3989
4062
  #define MMQ_Y_Q4_K_AMPERE 128
3990
4063
  #define NWARPS_Q4_K_AMPERE 4
4064
+ #endif
3991
4065
  #define MMQ_X_Q4_K_PASCAL 64
3992
4066
  #define MMQ_Y_Q4_K_PASCAL 64
3993
4067
  #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
4048
4122
  #define MMQ_X_Q5_K_RDNA1 32
4049
4123
  #define MMQ_Y_Q5_K_RDNA1 64
4050
4124
  #define NWARPS_Q5_K_RDNA1 8
4125
+ #if defined(CUDA_USE_TENSOR_CORES)
4126
+ #define MMQ_X_Q5_K_AMPERE 4
4127
+ #define MMQ_Y_Q5_K_AMPERE 32
4128
+ #define NWARPS_Q5_K_AMPERE 4
4129
+ #else
4051
4130
  #define MMQ_X_Q5_K_AMPERE 64
4052
4131
  #define MMQ_Y_Q5_K_AMPERE 128
4053
4132
  #define NWARPS_Q5_K_AMPERE 4
4133
+ #endif
4054
4134
  #define MMQ_X_Q5_K_PASCAL 64
4055
4135
  #define MMQ_Y_Q5_K_PASCAL 64
4056
4136
  #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
4109
4189
  #define MMQ_X_Q6_K_RDNA1 32
4110
4190
  #define MMQ_Y_Q6_K_RDNA1 64
4111
4191
  #define NWARPS_Q6_K_RDNA1 8
4192
+ #if defined(CUDA_USE_TENSOR_CORES)
4193
+ #define MMQ_X_Q6_K_AMPERE 4
4194
+ #define MMQ_Y_Q6_K_AMPERE 32
4195
+ #define NWARPS_Q6_K_AMPERE 4
4196
+ #else
4112
4197
  #define MMQ_X_Q6_K_AMPERE 64
4113
4198
  #define MMQ_Y_Q6_K_AMPERE 64
4114
4199
  #define NWARPS_Q6_K_AMPERE 4
4200
+ #endif
4115
4201
  #define MMQ_X_Q6_K_PASCAL 64
4116
4202
  #define MMQ_Y_Q6_K_PASCAL 64
4117
4203
  #define NWARPS_Q6_K_PASCAL 8
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4407
4493
  cpy_1(cx + x_offset, cdst + dst_offset);
4408
4494
  }
4409
4495
 
4410
- // rope == RoPE == rotary positional embedding
4496
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
4497
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
4498
+ return 1.0f - min(1.0f, max(0.0f, y));
4499
+ }
4500
+
4501
+ struct rope_corr_dims {
4502
+ float v[4];
4503
+ };
4504
+
4505
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
4506
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
4507
+ static __device__ void rope_yarn(
4508
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
4509
+ float * cos_theta, float * sin_theta
4510
+ ) {
4511
+ // Get n-d rotational scaling corrected for extrapolation
4512
+ float theta_interp = freq_scale * theta_extrap;
4513
+ float theta = theta_interp;
4514
+ if (ext_factor != 0.0f) {
4515
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
4516
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
4517
+
4518
+ // Get n-d magnitude scaling corrected for interpolation
4519
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
4520
+ }
4521
+ *cos_theta = cosf(theta) * mscale;
4522
+ *sin_theta = sinf(theta) * mscale;
4523
+ }
4411
4524
 
4525
+ // rope == RoPE == rotary positional embedding
4412
4526
  template<typename T, bool has_pos>
4413
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4414
- const int p_delta_rows, const float theta_scale) {
4527
+ static __global__ void rope(
4528
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4529
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4530
+ ) {
4415
4531
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4416
4532
 
4417
4533
  if (col >= ncols) {
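For intuition, the rope_yarn device function above blends an extrapolated and an interpolated rotation angle per dimension pair:

    theta        = (1 - ramp_mix) * theta_interp + ramp_mix * theta_extrap
    theta_interp = freq_scale * theta_extrap
    ramp_mix     = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor

As an illustrative example (the numbers are not from this diff): with freq_scale = 0.25 (4x context extension) and ext_factor = 1, dimension pairs below corr_dims.v[0] keep the original angle (ramp = 1, pure extrapolation), pairs above corr_dims.v[1] use 0.25 * theta_extrap (ramp = 0, pure interpolation), pairs in between blend linearly, and the magnitude correction becomes mscale *= 1 + 0.1 * ln(1/0.25) ≈ 1.139.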
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4423
4539
  const int i2 = row/p_delta_rows;
4424
4540
 
4425
4541
  const int p = has_pos ? pos[i2] : 0;
4426
- const float p0 = p*freq_scale;
4427
- const float theta = p0*powf(theta_scale, col/2);
4428
- const float sin_theta = sinf(theta);
4429
- const float cos_theta = cosf(theta);
4542
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
4543
+
4544
+ float cos_theta, sin_theta;
4545
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
4430
4546
 
4431
4547
  const float x0 = x[i + 0];
4432
4548
  const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4436
4552
  }
4437
4553
 
4438
4554
  template<typename T, bool has_pos>
4439
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4440
- const int p_delta_rows, const float theta_scale) {
4555
+ static __global__ void rope_neox(
4556
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4557
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4558
+ ) {
4441
4559
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4442
4560
 
4443
4561
  if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4448
4566
  const int i = row*ncols + col/2;
4449
4567
  const int i2 = row/p_delta_rows;
4450
4568
 
4569
+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
4570
+ const float cur_rot = -float(col)/ncols;
4571
+
4451
4572
  const int p = has_pos ? pos[i2] : 0;
4452
- const float p0 = p*freq_scale;
4453
- const float theta = p0*powf(theta_scale, col/2);
4454
- const float sin_theta = sinf(theta);
4455
- const float cos_theta = cosf(theta);
4573
+ const float theta_base = p*powf(freq_base, cur_rot);
4574
+
4575
+ float cos_theta, sin_theta;
4576
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
4456
4577
 
4457
4578
  const float x0 = x[i + 0];
4458
4579
  const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4461
4582
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4462
4583
  }
4463
4584
 
4464
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
4465
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
4585
+ static __global__ void rope_glm_f32(
4586
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4587
+ int n_ctx
4588
+ ) {
4466
4589
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4467
4590
  const int half_n_dims = ncols/4;
4468
4591
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4474
4597
  const int i = row*ncols + col;
4475
4598
  const int i2 = row/p_delta_rows;
4476
4599
 
4477
- const float col_theta_scale = powf(theta_scale, col);
4600
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
4478
4601
  // FIXME: this is likely wrong
4479
4602
  const int p = pos != nullptr ? pos[i2] : 0;
4480
4603
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
4616
4739
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4617
4740
  }
4618
4741
 
4742
+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
4743
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4744
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4745
+ }
4746
+
4619
4747
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4620
4748
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
4621
4749
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
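A minimal standalone usage sketch for the new mixed-precision add wrapper added above (buffer setup and the driver function are assumptions for illustration; inside ggml this wrapper is invoked through ggml_cuda_op_add, shown further down in the diff):

    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    // hypothetical driver: dst[i] = (float) x[i] + y[i] for k elements
    static void example_add_f16_f32_f32(cudaStream_t stream) {
        const int k = 1024;
        half * x_d; float * y_d; float * dst_d;
        cudaMalloc(&x_d,   k*sizeof(half));
        cudaMalloc(&y_d,   k*sizeof(float));
        cudaMalloc(&dst_d, k*sizeof(float));
        // ... fill x_d and y_d ...
        add_f16_f32_f32_cuda(x_d, y_d, dst_d, k, stream);
        cudaStreamSynchronize(stream);
        cudaFree(x_d); cudaFree(y_d); cudaFree(dst_d);
    }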
@@ -5493,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
5493
5621
  }
5494
5622
 
5495
5623
  template<typename T>
5496
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5497
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5624
+ static void rope_cuda(
5625
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5626
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5627
+ ) {
5498
5628
  GGML_ASSERT(ncols % 2 == 0);
5499
5629
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5500
5630
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5501
5631
  const dim3 block_nums(nrows, num_blocks_x, 1);
5502
5632
  if (pos == nullptr) {
5503
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5633
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
5634
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5635
+ );
5504
5636
  } else {
5505
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5637
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
5638
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5639
+ );
5506
5640
  }
5507
5641
  }
5508
5642
 
5509
5643
  template<typename T>
5510
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5511
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5644
+ static void rope_neox_cuda(
5645
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5646
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5647
+ ) {
5512
5648
  GGML_ASSERT(ncols % 2 == 0);
5513
5649
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5514
5650
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5515
5651
  const dim3 block_nums(nrows, num_blocks_x, 1);
5516
5652
  if (pos == nullptr) {
5517
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5653
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
5654
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5655
+ );
5518
5656
  } else {
5519
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5657
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
5658
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5659
+ );
5520
5660
  }
5521
5661
  }
5522
5662
 
5523
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5524
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5663
+ static void rope_glm_f32_cuda(
5664
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5665
+ float freq_base, int n_ctx, cudaStream_t stream
5666
+ ) {
5525
5667
  GGML_ASSERT(ncols % 4 == 0);
5526
5668
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5527
5669
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
5528
5670
  const dim3 block_nums(num_blocks_x, nrows, 1);
5529
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
5671
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
5530
5672
  }
5531
5673
 
5532
5674
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5663,6 +5805,16 @@ void ggml_init_cublas() {
5663
5805
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
5664
5806
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5665
5807
  int64_t total_vram = 0;
5808
+ #if defined(GGML_CUDA_FORCE_MMQ)
5809
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
5810
+ #else
5811
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
5812
+ #endif
5813
+ #if defined(CUDA_USE_TENSOR_CORES)
5814
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
5815
+ #else
5816
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
5817
+ #endif
5666
5818
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5667
5819
  for (int id = 0; id < g_device_count; ++id) {
5668
5820
  cudaDeviceProp prop;
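With these prints in place, a default single-GPU CUDA build is expected to log something like the following at initialization (the device count and the device lines that follow are illustrative; GGML_CUDA_NAME expands to "CUDA" on non-HIP builds):

    ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no
    ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
    ggml_init_cublas: found 1 CUDA devices: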
@@ -5909,7 +6061,10 @@ inline void ggml_cuda_op_add(
5909
6061
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5910
6062
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5911
6063
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6064
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6065
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
5912
6066
  } else {
6067
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
5913
6068
  GGML_ASSERT(false);
5914
6069
  }
5915
6070
 
@@ -6347,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
6347
6502
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6348
6503
  row_diff, src1_ncols, ne10,
6349
6504
  &alpha, src0_ddf_i, ne00,
6350
- src1_ddf_i, ne10,
6505
+ src1_ddf_i, ne10,
6351
6506
  &beta, dst_dd_i, ldc));
6352
6507
 
6353
6508
  if (src0_as != 0) {
@@ -6373,17 +6528,20 @@ inline void ggml_cuda_op_rope(
6373
6528
  const int64_t ne2 = dst->ne[2];
6374
6529
  const int64_t nrows = ggml_nrows(src0);
6375
6530
 
6376
- //const int n_past = ((int32_t *) dst->op_params)[0];
6377
- const int n_dims = ((int32_t *) dst->op_params)[1];
6378
- const int mode = ((int32_t *) dst->op_params)[2];
6379
- const int n_ctx = ((int32_t *) dst->op_params)[3];
6380
- // RoPE alteration for extended context
6381
-
6382
- float freq_base, freq_scale;
6383
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
6384
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
6531
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6532
+ const int n_dims = ((int32_t *) dst->op_params)[1];
6533
+ const int mode = ((int32_t *) dst->op_params)[2];
6534
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
6535
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
6385
6536
 
6386
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
6537
+ // RoPE alteration for extended context
6538
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
6539
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
6540
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
6541
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
6542
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
6543
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
6544
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
6387
6545
 
6388
6546
  const int32_t * pos = nullptr;
6389
6547
  if ((mode & 1) == 0) {
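For reference, the dst->op_params slots consumed by this function after the change (the floats are stored bit-for-bit in the int32_t array, hence the memcpy calls):

    // [0]  n_past       (unused here, see the commented-out line)
    // [1]  n_dims
    // [2]  mode
    // [3]  n_ctx
    // [4]  n_orig_ctx   (newly read here)
    // [5]  freq_base    (float)
    // [6]  freq_scale   (float)
    // [7]  ext_factor   (float)  -- YaRN, newly read here
    // [8]  attn_factor  (float)  -- YaRN, newly read here
    // [9]  beta_fast    (float)  -- YaRN, newly read here
    // [10] beta_slow    (float)  -- YaRN, newly read here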
@@ -6395,24 +6553,39 @@ inline void ggml_cuda_op_rope(
6395
6553
  const bool is_neox = mode & 2;
6396
6554
  const bool is_glm = mode & 4;
6397
6555
 
6556
+ rope_corr_dims corr_dims;
6557
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
6558
+
6398
6559
  // compute
6399
6560
  if (is_glm) {
6400
6561
  GGML_ASSERT(false);
6401
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6562
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
6402
6563
  } else if (is_neox) {
6403
6564
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6404
6565
  if (src0->type == GGML_TYPE_F32) {
6405
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6566
+ rope_neox_cuda(
6567
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6568
+ attn_factor, corr_dims, main_stream
6569
+ );
6406
6570
  } else if (src0->type == GGML_TYPE_F16) {
6407
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6571
+ rope_neox_cuda(
6572
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6573
+ attn_factor, corr_dims, main_stream
6574
+ );
6408
6575
  } else {
6409
6576
  GGML_ASSERT(false);
6410
6577
  }
6411
6578
  } else {
6412
6579
  if (src0->type == GGML_TYPE_F32) {
6413
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6580
+ rope_cuda(
6581
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6582
+ attn_factor, corr_dims, main_stream
6583
+ );
6414
6584
  } else if (src0->type == GGML_TYPE_F16) {
6415
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6585
+ rope_cuda(
6586
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6587
+ attn_factor, corr_dims, main_stream
6588
+ );
6416
6589
  } else {
6417
6590
  GGML_ASSERT(false);
6418
6591
  }
@@ -6523,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
6523
6696
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6524
6697
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6525
6698
 
6526
- const float min = ((float *) dst->op_params)[0];
6527
- const float max = ((float *) dst->op_params)[1];
6699
+ float min;
6700
+ float max;
6701
+ memcpy(&min, dst->op_params, sizeof(float));
6702
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
6528
6703
 
6529
6704
  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6530
6705
  CUDA_CHECK(cudaGetLastError());
@@ -7048,9 +7223,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7048
7223
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7049
7224
  }
7050
7225
 
7051
- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
7226
+ __global__ void k_compute_batched_ptrs(
7227
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7228
+ void ** ptrs,
7229
+ int ne12, int ne13,
7230
+ int ne23,
7231
+ int nb02, int nb03,
7232
+ int nb12, int nb13,
7233
+ int nb2, int nb3,
7234
+ int r2, int r3) {
7235
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
7236
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
7237
+
7238
+ if (i13 >= ne13 || i12 >= ne12) {
7239
+ return;
7240
+ }
7241
+
7242
+ int i03 = i13 / r3;
7243
+ int i02 = i12 / r2;
7244
+
7245
+ ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
7246
+ ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
7247
+ ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7248
+ }
7249
+
7250
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7052
7251
  GGML_ASSERT(!ggml_is_transposed(src0));
7053
7252
  GGML_ASSERT(!ggml_is_transposed(src1));
7253
+
7054
7254
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
7055
7255
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
7056
7256
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7348,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7148
7348
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7149
7349
  } else {
7150
7350
  // use cublasGemmBatchedEx
7151
- // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
7152
7351
  const int ne23 = ne12*ne13;
7153
7352
 
7154
- // TODO: avoid this alloc
7155
- void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
7156
-
7157
- for (int i13 = 0; i13 < ne13; ++i13) {
7158
- for (int i12 = 0; i12 < ne12; ++i12) {
7159
- int i03 = i13 / r3;
7160
- int i02 = i12 / r2;
7161
-
7162
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
7163
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
7164
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
7165
- }
7166
- }
7167
-
7168
- // allocate device memory for pointers
7169
7353
  void ** ptrs_as = nullptr;
7170
- CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
7171
-
7172
- // TODO: this does not work for some reason -- not sure why?
7173
- //size_t ptrs_s = 0;
7174
- //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7175
-
7176
- // copy pointers to device
7177
- CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
7178
-
7179
- free(ptrs);
7354
+ size_t ptrs_s = 0;
7355
+ ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7356
+
7357
+ dim3 block_dims(ne13, ne12);
7358
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
7359
+ src0_as_f16, src1_as_f16, dst_f16,
7360
+ ptrs_as,
7361
+ ne12, ne13,
7362
+ ne23,
7363
+ nb02, nb03,
7364
+ nb12, nb13,
7365
+ dst->nb[2], dst->nb[3],
7366
+ r2, r3);
7367
+ CUDA_CHECK(cudaGetLastError());
7180
7368
 
7181
7369
  CUBLAS_CHECK(
7182
7370
  cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7183
7371
  ne01, ne11, ne10,
7184
- &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7185
- (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7186
- &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7372
+ &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7373
+ (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7374
+ &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7187
7375
  ne23,
7188
7376
  CUBLAS_COMPUTE_16F,
7189
7377
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7190
7378
 
7191
- // free device memory for pointers
7192
- CUDA_CHECK(cudaFree(ptrs_as));
7193
- //ggml_cuda_pool_free(ptrs_as, ptrs_s);
7379
+ ggml_cuda_pool_free(ptrs_as, ptrs_s);
7194
7380
  }
7195
7381
  #endif
7196
7382
 
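The removed host-side loop, cudaMalloc and cudaMemcpy are replaced by k_compute_batched_ptrs, which fills the same pointer table directly in pool-allocated device memory on main_stream. The layout it produces, and that cublasGemmBatchedEx then consumes, is:

    // ptrs_as holds 3*ne23 device pointers, one group of ne23 per operand,
    // indexed by b = i12 + i13*ne12 (with i02 = i12/r2, i03 = i13/r3 for broadcasting):
    //   ptrs_as[0*ne23 + b] -> src0_as_f16 + i02*nb02  + i03*nb03     (A, f16)
    //   ptrs_as[1*ne23 + b] -> src1_as_f16 + i12*nb12/2 + i13*nb13/2  (B, f16)
    //   ptrs_as[2*ne23 + b] -> dst_f16     + i12*nb2/2  + i13*nb3/2   (C, f16)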
@@ -7202,17 +7388,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7202
7388
  }
7203
7389
 
7204
7390
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7205
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
7206
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
7391
+ const bool all_on_device =
7392
+ (src0->backend == GGML_BACKEND_GPU) &&
7393
+ (src1->backend == GGML_BACKEND_GPU) &&
7394
+ ( dst->backend == GGML_BACKEND_GPU);
7207
7395
 
7208
7396
  int64_t min_compute_capability = INT_MAX;
7209
7397
  for (int64_t id = 0; id < g_device_count; ++id) {
7210
- if (min_compute_capability > g_compute_capabilities[id]
7211
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7398
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7212
7399
  min_compute_capability = g_compute_capabilities[id];
7213
7400
  }
7214
7401
  }
7215
7402
 
7403
+ #ifdef CUDA_USE_TENSOR_CORES
7404
+ const bool use_tensor_cores = true;
7405
+ #else
7406
+ const bool use_tensor_cores = false;
7407
+ #endif
7408
+
7216
7409
  // debug helpers
7217
7410
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
7218
7411
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7414,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7221
7414
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
7222
7415
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
7223
7416
 
7224
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7417
+ if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7225
7418
  // KQ single-batch
7226
7419
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7227
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7420
+ } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7228
7421
  // KQV single-batch
7229
7422
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7230
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
7423
+ } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
7231
7424
  // KQ + KQV multi-batch
7232
7425
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
7233
7426
  } else if (src0->type == GGML_TYPE_F32) {
7234
7427
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7235
7428
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7236
7429
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
7237
-
7238
7430
  #ifdef GGML_CUDA_FORCE_DMMV
7239
7431
  const bool use_mul_mat_vec_q = false;
7240
7432
  #else
@@ -7247,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7247
7439
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
7248
7440
  }
7249
7441
  } else {
7250
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
7442
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
7443
+
7444
+ // when tensor cores are available, use them for large batch size
7445
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
7446
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
7447
+ use_mul_mat_q = false;
7448
+ }
7449
+
7450
+ if (use_mul_mat_q) {
7251
7451
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
7252
7452
  } else {
7253
7453
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
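This is the run-time replacement for the removed g_mul_mat_q global (and for ggml_cuda_set_mul_mat_q, deleted in the last hunk below): instead of a caller-controlled toggle, the MMQ-vs-cuBLAS choice is now derived per multiplication. Condensed, using the names from this diff:

    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
    // with tensor cores and a batch larger than MMQ_MAX_BATCH_SIZE, cuBLAS is the faster path
    if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
        use_mul_mat_q = false;
    }
    // use_mul_mat_q ? ggml_cuda_op_mul_mat_q : ggml_cuda_op_mul_mat_cublas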
@@ -7601,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
7601
7801
  }
7602
7802
  }
7603
7803
 
7604
- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
7605
- g_mul_mat_q = mul_mat_q;
7606
- }
7607
-
7608
7804
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
7609
7805
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
7610
7806
  // it still won't always work as expected, but it's better than nothing