llama_cpp 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -87,6 +87,24 @@
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+ // - 7B quantum model: +100-200 MB
+ // - 13B quantum model: +200-400 MB
+ //
+ //#define GGML_CUDA_FORCE_MMQ
+
+ // TODO: improve this to be correct for more hardware
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+ // probably other such cases, and not sure what happens on AMD hardware
+ #if !defined(GGML_CUDA_FORCE_MMQ)
+ #define CUDA_USE_TENSOR_CORES
+ #endif
+
+ // max batch size to use MMQ kernels when tensor cores are available
+ #define MMQ_MAX_BATCH_SIZE 32
+
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300
 
@@ -470,7 +488,6 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = true;
 
  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 0; // disabled by default
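The compile-time switches introduced in the first hunk take over from the runtime g_mul_mat_q flag that is deleted here. A minimal sketch of the resulting policy (not part of the package diff; should_use_mmq is a hypothetical name, the real logic lives inline in ggml_cuda_mul_mat further down in this diff):

// Sketch only, using the constants shown elsewhere in this diff (MIN_CC_DP4A, CC_VOLTA, MMQ_MAX_BATCH_SIZE).
static bool should_use_mmq(bool src0_is_quantized, int min_compute_capability, int64_t src1_batch_size) {
    bool use_mmq = src0_is_quantized && min_compute_capability >= MIN_CC_DP4A;
#ifdef CUDA_USE_TENSOR_CORES // defined by default, i.e. unless GGML_CUDA_FORCE_MMQ is set at build time
    if (min_compute_capability >= CC_VOLTA && src1_batch_size > MMQ_MAX_BATCH_SIZE) {
        use_mmq = false; // hand large batches to cuBLAS so they can use F16 tensor cores
    }
#endif
    return use_mmq;
}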
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
  dst[i] = __hadd(x[i], __float2half(y[i]));
  }
 
+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = __half2float(x[i]) + y[i];
+ }
+
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
  #define MMQ_X_Q4_0_RDNA1 64
  #define MMQ_Y_Q4_0_RDNA1 64
  #define NWARPS_Q4_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_0_AMPERE 4
+ #define MMQ_Y_Q4_0_AMPERE 32
+ #define NWARPS_Q4_0_AMPERE 4
+ #else
  #define MMQ_X_Q4_0_AMPERE 64
  #define MMQ_Y_Q4_0_AMPERE 128
  #define NWARPS_Q4_0_AMPERE 4
+ #endif
  #define MMQ_X_Q4_0_PASCAL 64
  #define MMQ_Y_Q4_0_PASCAL 64
  #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_1_RDNA1 64
  #define MMQ_Y_Q4_1_RDNA1 64
  #define NWARPS_Q4_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_1_AMPERE 4
+ #define MMQ_Y_Q4_1_AMPERE 32
+ #define NWARPS_Q4_1_AMPERE 4
+ #else
  #define MMQ_X_Q4_1_AMPERE 64
  #define MMQ_Y_Q4_1_AMPERE 128
  #define NWARPS_Q4_1_AMPERE 4
+ #endif
  #define MMQ_X_Q4_1_PASCAL 64
  #define MMQ_Y_Q4_1_PASCAL 64
  #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_0_RDNA1 64
  #define MMQ_Y_Q5_0_RDNA1 64
  #define NWARPS_Q5_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_0_AMPERE 4
+ #define MMQ_Y_Q5_0_AMPERE 32
+ #define NWARPS_Q5_0_AMPERE 4
+ #else
  #define MMQ_X_Q5_0_AMPERE 128
  #define MMQ_Y_Q5_0_AMPERE 64
  #define NWARPS_Q5_0_AMPERE 4
+ #endif
  #define MMQ_X_Q5_0_PASCAL 64
  #define MMQ_Y_Q5_0_PASCAL 64
  #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_1_RDNA1 64
  #define MMQ_Y_Q5_1_RDNA1 64
  #define NWARPS_Q5_1_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_1_AMPERE 4
+ #define MMQ_Y_Q5_1_AMPERE 32
+ #define NWARPS_Q5_1_AMPERE 4
+ #else
  #define MMQ_X_Q5_1_AMPERE 128
  #define MMQ_Y_Q5_1_AMPERE 64
  #define NWARPS_Q5_1_AMPERE 4
+ #endif
  #define MMQ_X_Q5_1_PASCAL 64
  #define MMQ_Y_Q5_1_PASCAL 64
  #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
  #define MMQ_X_Q8_0_RDNA1 64
  #define MMQ_Y_Q8_0_RDNA1 64
  #define NWARPS_Q8_0_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q8_0_AMPERE 4
+ #define MMQ_Y_Q8_0_AMPERE 32
+ #define NWARPS_Q8_0_AMPERE 4
+ #else
  #define MMQ_X_Q8_0_AMPERE 128
  #define MMQ_Y_Q8_0_AMPERE 64
  #define NWARPS_Q8_0_AMPERE 4
+ #endif
  #define MMQ_X_Q8_0_PASCAL 64
  #define MMQ_Y_Q8_0_PASCAL 64
  #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q2_K_RDNA1 128
  #define MMQ_Y_Q2_K_RDNA1 32
  #define NWARPS_Q2_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q2_K_AMPERE 4
+ #define MMQ_Y_Q2_K_AMPERE 32
+ #define NWARPS_Q2_K_AMPERE 4
+ #else
  #define MMQ_X_Q2_K_AMPERE 64
  #define MMQ_Y_Q2_K_AMPERE 128
  #define NWARPS_Q2_K_AMPERE 4
+ #endif
  #define MMQ_X_Q2_K_PASCAL 64
  #define MMQ_Y_Q2_K_PASCAL 64
  #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
  #define MMQ_X_Q3_K_RDNA1 32
  #define MMQ_Y_Q3_K_RDNA1 128
  #define NWARPS_Q3_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q3_K_AMPERE 4
+ #define MMQ_Y_Q3_K_AMPERE 32
+ #define NWARPS_Q3_K_AMPERE 4
+ #else
  #define MMQ_X_Q3_K_AMPERE 128
  #define MMQ_Y_Q3_K_AMPERE 128
  #define NWARPS_Q3_K_AMPERE 4
+ #endif
  #define MMQ_X_Q3_K_PASCAL 64
  #define MMQ_Y_Q3_K_PASCAL 64
  #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q4_K_RDNA1 32
  #define MMQ_Y_Q4_K_RDNA1 64
  #define NWARPS_Q4_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q4_K_AMPERE 4
+ #define MMQ_Y_Q4_K_AMPERE 32
+ #define NWARPS_Q4_K_AMPERE 4
+ #else
  #define MMQ_X_Q4_K_AMPERE 64
  #define MMQ_Y_Q4_K_AMPERE 128
  #define NWARPS_Q4_K_AMPERE 4
+ #endif
  #define MMQ_X_Q4_K_PASCAL 64
  #define MMQ_Y_Q4_K_PASCAL 64
  #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
  #define MMQ_X_Q5_K_RDNA1 32
  #define MMQ_Y_Q5_K_RDNA1 64
  #define NWARPS_Q5_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q5_K_AMPERE 4
+ #define MMQ_Y_Q5_K_AMPERE 32
+ #define NWARPS_Q5_K_AMPERE 4
+ #else
  #define MMQ_X_Q5_K_AMPERE 64
  #define MMQ_Y_Q5_K_AMPERE 128
  #define NWARPS_Q5_K_AMPERE 4
+ #endif
  #define MMQ_X_Q5_K_PASCAL 64
  #define MMQ_Y_Q5_K_PASCAL 64
  #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
  #define MMQ_X_Q6_K_RDNA1 32
  #define MMQ_Y_Q6_K_RDNA1 64
  #define NWARPS_Q6_K_RDNA1 8
+ #if defined(CUDA_USE_TENSOR_CORES)
+ #define MMQ_X_Q6_K_AMPERE 4
+ #define MMQ_Y_Q6_K_AMPERE 32
+ #define NWARPS_Q6_K_AMPERE 4
+ #else
  #define MMQ_X_Q6_K_AMPERE 64
  #define MMQ_Y_Q6_K_AMPERE 64
  #define NWARPS_Q6_K_AMPERE 4
+ #endif
  #define MMQ_X_Q6_K_PASCAL 64
  #define MMQ_Y_Q6_K_PASCAL 64
  #define NWARPS_Q6_K_PASCAL 8
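The MMQ_X_*/MMQ_Y_*/NWARPS_* macros above are per-architecture tile sizes for the quantized mul_mat_q kernels; the new CUDA_USE_TENSOR_CORES branch shrinks the Ampere tiles to 4x32 because, with tensor cores enabled, MMQ is only kept for small batches (at most MMQ_MAX_BATCH_SIZE). A rough sketch, assuming the usual launch pattern in this file where MMQ_Y tiles the quantized rows and MMQ_X tiles the batch columns (an assumption, not quoted from the diff):

// Illustrative only: how tile sizes would translate into a launch grid under the stated assumption.
static dim3 mmq_block_nums(int nrows_x, int ncols_y, int mmq_x, int mmq_y) {
    const int grid_x = (nrows_x + mmq_y - 1) / mmq_y; // quantized-matrix rows covered per MMQ_Y tile
    const int grid_y = (ncols_y + mmq_x - 1) / mmq_x; // batch columns covered per MMQ_X tile
    return dim3(grid_x, grid_y, 1);
}
// e.g. nrows_x = 4096, ncols_y = 8 with the tensor-core tile (MMQ_X = 4, MMQ_Y = 32) gives a 128 x 2 grid.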
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  cpy_1(cx + x_offset, cdst + dst_offset);
  }
 
- // rope == RoPE == rotary positional embedding
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
+ return 1.0f - min(1.0f, max(0.0f, y));
+ }
+
+ struct rope_corr_dims {
+ float v[4];
+ };
+
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+ static __device__ void rope_yarn(
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+ float * cos_theta, float * sin_theta
+ ) {
+ // Get n-d rotational scaling corrected for extrapolation
+ float theta_interp = freq_scale * theta_extrap;
+ float theta = theta_interp;
+ if (ext_factor != 0.0f) {
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+ // Get n-d magnitude scaling corrected for interpolation
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+ }
+ *cos_theta = cosf(theta) * mscale;
+ *sin_theta = sinf(theta) * mscale;
+ }
 
+ // rope == RoPE == rotary positional embedding
  template<typename T, bool has_pos>
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
  if (col >= ncols) {
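Written out, the rope_yarn helper added above blends the interpolated and extrapolated angles per rotary dimension (this is a restatement of the code; $s$ is freq_scale, $m$ the incoming mscale, and $\gamma$ the ramp value scaled by ext_factor):

$$\theta_{\mathrm{interp}} = s\,\theta_{\mathrm{extrap}}, \qquad \gamma = \mathrm{ramp}(i_0)\cdot \mathrm{ext\_factor}, \qquad \theta = (1-\gamma)\,\theta_{\mathrm{interp}} + \gamma\,\theta_{\mathrm{extrap}}$$

with the magnitude correction $m \leftarrow m\,(1 + 0.1\,\ln(1/s))$ applied only when ext_factor is non-zero, and the kernel finally using $m\cos\theta$ and $m\sin\theta$ for the rotation.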
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  const int i2 = row/p_delta_rows;
 
  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
  const float x0 = x[i + 0];
  const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
  }
 
  template<typename T, bool has_pos>
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale) {
+ static __global__ void rope_neox(
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
  if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  const int i = row*ncols + col/2;
  const int i2 = row/p_delta_rows;
 
+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+ const float cur_rot = -float(col)/ncols;
+
  const int p = has_pos ? pos[i2] : 0;
- const float p0 = p*freq_scale;
- const float theta = p0*powf(theta_scale, col/2);
- const float sin_theta = sinf(theta);
- const float cos_theta = cosf(theta);
+ const float theta_base = p*powf(freq_base, cur_rot);
+
+ float cos_theta, sin_theta;
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
  const float x0 = x[i + 0];
  const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }
 
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
+ static __global__ void rope_glm_f32(
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+ int n_ctx
+ ) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;
  const int i2 = row/p_delta_rows;
 
- const float col_theta_scale = powf(theta_scale, col);
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
  // FIXME: this is likely wrong
  const int p = pos != nullptr ? pos[i2] : 0;
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
  }
 
+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+ }
+
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
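A minimal host-side usage sketch for the new half-plus-float path added above (illustrative only; the buffer names and size are made up and error checking is omitted):

// Sketch: dst[i] = (float) x[i] + y[i] for n elements, via the launcher added above.
const int n = 1024;
half  * x_dev = nullptr; float * y_dev = nullptr; float * dst_dev = nullptr;
cudaMalloc(&x_dev,   n*sizeof(half));
cudaMalloc(&y_dev,   n*sizeof(float));
cudaMalloc(&dst_dev, n*sizeof(float));
// ... upload x_dev and y_dev ...
add_f16_f32_f32_cuda(x_dev, y_dev, dst_dev, n, /*stream=*/0); // 0 = default stream
cudaDeviceSynchronize();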
@@ -5493,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
  }
 
  template<typename T>
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }
 
  template<typename T>
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ static void rope_neox_cuda(
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
  if (pos == nullptr) {
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  } else {
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ );
  }
  }
 
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ static void rope_glm_f32_cuda(
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float freq_base, int n_ctx, cudaStream_t stream
+ ) {
  GGML_ASSERT(ncols % 4 == 0);
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
  }
 
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5663,6 +5805,16 @@ void ggml_init_cublas() {
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
+ #if defined(GGML_CUDA_FORCE_MMQ)
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ #endif
+ #if defined(CUDA_USE_TENSOR_CORES)
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ #else
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ #endif
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
  for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
@@ -5909,7 +6061,10 @@ inline void ggml_cuda_op_add(
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
  } else {
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
  GGML_ASSERT(false);
  }
 
@@ -6347,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  row_diff, src1_ncols, ne10,
  &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
+ src1_ddf_i, ne10,
  &beta, dst_dd_i, ldc));
 
  if (src0_as != 0) {
@@ -6373,17 +6528,20 @@ inline void ggml_cuda_op_rope(
  const int64_t ne2 = dst->ne[2];
  const int64_t nrows = ggml_nrows(src0);
 
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t *) dst->op_params)[1];
- const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- // RoPE alteration for extended context
-
- float freq_base, freq_scale;
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ //const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ // RoPE alteration for extended context
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
  const int32_t * pos = nullptr;
  if ((mode & 1) == 0) {
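For reference, the op_params layout implied by the reads above (each slot is 32 bits; slots 5 through 10 carry floats and are read with memcpy):

// dst->op_params slots as used by ggml_cuda_op_rope after this change (derived from the code above):
//  [0] n_past (unused)  [1] n_dims       [2] mode        [3] n_ctx        [4] n_orig_ctx
//  [5] freq_base        [6] freq_scale   [7] ext_factor  [8] attn_factor  [9] beta_fast   [10] beta_slow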
@@ -6395,24 +6553,39 @@ inline void ggml_cuda_op_rope(
  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;
 
+ rope_corr_dims corr_dims;
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
  // compute
  if (is_glm) {
  GGML_ASSERT(false);
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_neox_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
  } else {
  if (src0->type == GGML_TYPE_F32) {
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else if (src0->type == GGML_TYPE_F16) {
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ rope_cuda(
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ attn_factor, corr_dims, main_stream
+ );
  } else {
  GGML_ASSERT(false);
  }
@@ -6523,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
- const float min = ((float *) dst->op_params)[0];
- const float max = ((float *) dst->op_params)[1];
+ float min;
+ float max;
+ memcpy(&min, dst->op_params, sizeof(float));
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
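The clamp change swaps pointer-cast reads of op_params for memcpy, which extracts the float bits from the int32-typed storage without aliasing through an incompatible pointer type. The general pattern looks like this (a sketch; read_f32_slot is a hypothetical helper and requires <cstring>):

// Sketch: well-defined reinterpretation of a raw 32-bit parameter slot as a float.
static float read_f32_slot(const int32_t * params, int idx) {
    float v;
    memcpy(&v, params + idx, sizeof(float)); // unlike *(const float *)(params + idx), this avoids strict-aliasing problems
    return v;
}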
@@ -7048,9 +7223,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }
 
- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ __global__ void k_compute_batched_ptrs(
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ void ** ptrs,
+ int ne12, int ne13,
+ int ne23,
+ int nb02, int nb03,
+ int nb12, int nb13,
+ int nb2, int nb3,
+ int r2, int r3) {
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (i13 >= ne13 || i12 >= ne12) {
+ return;
+ }
+
+ int i03 = i13 / r3;
+ int i02 = i12 / r2;
+
+ ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ }
+
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
+
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
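The new k_compute_batched_ptrs kernel fills the pointer arrays for cublasGemmBatchedEx directly on the GPU; the next hunk removes the old host-side loop plus cudaMalloc/cudaMemcpy/cudaFree in favor of the pool allocator. The array it writes is laid out as three contiguous groups of ne23 entries:

// Layout of ptrs (sketch, read off the indexing in k_compute_batched_ptrs above):
//   ptrs[0*ne23 + i]  -> A pointer (src0 as F16) for batch element i
//   ptrs[1*ne23 + i]  -> B pointer (src1 as F16) for batch element i
//   ptrs[2*ne23 + i]  -> C pointer (dst  as F16) for batch element i
// with i = i12 + i13*ne12 and ne23 = ne12*ne13.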
@@ -7148,49 +7348,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  } else {
  // use cublasGemmBatchedEx
- // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
  const int ne23 = ne12*ne13;
 
- // TODO: avoid this alloc
- void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
-
- for (int i13 = 0; i13 < ne13; ++i13) {
- for (int i12 = 0; i12 < ne12; ++i12) {
- int i03 = i13 / r3;
- int i02 = i12 / r2;
-
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
- }
- }
-
- // allocate device memory for pointers
  void ** ptrs_as = nullptr;
- CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
-
- // TODO: this does not work for some reason -- not sure why?
- //size_t ptrs_s = 0;
- //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
-
- // copy pointers to device
- CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
-
- free(ptrs);
+ size_t ptrs_s = 0;
+ ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+ dim3 block_dims(ne13, ne12);
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+ src0_as_f16, src1_as_f16, dst_f16,
+ ptrs_as,
+ ne12, ne13,
+ ne23,
+ nb02, nb03,
+ nb12, nb13,
+ dst->nb[2], dst->nb[3],
+ r2, r3);
+ CUDA_CHECK(cudaGetLastError());
 
  CUBLAS_CHECK(
  cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+ &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
  ne23,
  CUBLAS_COMPUTE_16F,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
- // free device memory for pointers
- CUDA_CHECK(cudaFree(ptrs_as));
- //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+ ggml_cuda_pool_free(ptrs_as, ptrs_s);
  }
  #endif
 
@@ -7202,17 +7388,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  }
 
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+ const bool all_on_device =
+ (src0->backend == GGML_BACKEND_GPU) &&
+ (src1->backend == GGML_BACKEND_GPU) &&
+ ( dst->backend == GGML_BACKEND_GPU);
 
  int64_t min_compute_capability = INT_MAX;
  for (int64_t id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
  min_compute_capability = g_compute_capabilities[id];
  }
  }
 
+ #ifdef CUDA_USE_TENSOR_CORES
+ const bool use_tensor_cores = true;
+ #else
+ const bool use_tensor_cores = false;
+ #endif
+
  // debug helpers
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7414,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  // KQ single-batch
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+ } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+ } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
  // KQ + KQV multi-batch
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
  } else if (src0->type == GGML_TYPE_F32) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
  #ifdef GGML_CUDA_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
@@ -7247,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
  }
  } else {
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+ // when tensor cores are available, use them for large batch size
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+ use_mul_mat_q = false;
+ }
+
+ if (use_mul_mat_q) {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
@@ -7601,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
  }
  }
 
- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
- g_mul_mat_q = mul_mat_q;
- }
-
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
  // it still won't always work as expected, but it's better than nothing