llama_cpp 0.9.0 → 0.9.2

Diff of the vendored GGML CUDA backend source between the two gem releases.

@@ -87,6 +87,24 @@
87
87
  #define CC_OFFSET_AMD 1000000
88
88
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
89
89
 
90
+ // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
91
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
92
+ // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
93
+ // - 7B quantum model: +100-200 MB
94
+ // - 13B quantum model: +200-400 MB
95
+ //
96
+ //#define GGML_CUDA_FORCE_MMQ
97
+
98
+ // TODO: improve this to be correct for more hardware
99
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
100
+ // probably other such cases, and not sure what happens on AMD hardware
101
+ #if !defined(GGML_CUDA_FORCE_MMQ)
102
+ #define CUDA_USE_TENSOR_CORES
103
+ #endif
104
+
105
+ // max batch size to use MMQ kernels when tensor cores are available
106
+ #define MMQ_MAX_BATCH_SIZE 32
107
+
90
108
  #if defined(GGML_USE_HIPBLAS)
91
109
  #define __CUDA_ARCH__ 1300
92
110
 
@@ -470,7 +488,6 @@ static int g_device_count = -1;
470
488
  static int g_main_device = 0;
471
489
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
472
490
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
473
- static bool g_mul_mat_q = true;
474
491
 
475
492
  static void * g_scratch_buffer = nullptr;
476
493
  static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
496
513
  dst[i] = __hadd(x[i], __float2half(y[i]));
497
514
  }
498
515
 
516
+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
517
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
518
+
519
+ if (i >= k) {
520
+ return;
521
+ }
522
+ dst[i] = __half2float(x[i]) + y[i];
523
+ }
524
+
499
525
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
500
526
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
501
527
 
@@ -956,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
956
982
 
957
983
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
958
984
 
959
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
985
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
960
986
  if (row > nrows) return;
961
987
 
962
988
  const int num_blocks_per_row = ncols / QK_K;
@@ -1060,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
1060
1086
 
1061
1087
  static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1062
1088
 
1063
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1089
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1064
1090
  if (row > nrows) return;
1065
1091
 
1066
1092
  const int num_blocks_per_row = ncols / QK_K;
@@ -1164,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
1164
1190
 
1165
1191
  static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1166
1192
 
1167
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1193
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1168
1194
  if (row > nrows) return;
1169
1195
  const int num_blocks_per_row = ncols / QK_K;
1170
1196
  const int ib0 = row*num_blocks_per_row;
@@ -1418,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
1418
1444
 
1419
1445
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1420
1446
 
1421
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1447
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1422
1448
  if (row > nrows) return;
1423
1449
 
1424
1450
  const int num_blocks_per_row = ncols / QK_K;
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
3554
3580
  #define MMQ_X_Q4_0_RDNA1 64
3555
3581
  #define MMQ_Y_Q4_0_RDNA1 64
3556
3582
  #define NWARPS_Q4_0_RDNA1 8
3583
+ #if defined(CUDA_USE_TENSOR_CORES)
3584
+ #define MMQ_X_Q4_0_AMPERE 4
3585
+ #define MMQ_Y_Q4_0_AMPERE 32
3586
+ #define NWARPS_Q4_0_AMPERE 4
3587
+ #else
3557
3588
  #define MMQ_X_Q4_0_AMPERE 64
3558
3589
  #define MMQ_Y_Q4_0_AMPERE 128
3559
3590
  #define NWARPS_Q4_0_AMPERE 4
3591
+ #endif
3560
3592
  #define MMQ_X_Q4_0_PASCAL 64
3561
3593
  #define MMQ_Y_Q4_0_PASCAL 64
3562
3594
  #define NWARPS_Q4_0_PASCAL 8
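
With CUDA_USE_TENSOR_CORES defined, MMQ only ever handles batch sizes up to MMQ_MAX_BATCH_SIZE (32), so the Ampere tile width in the batch dimension shrinks from 64 (or 128) to 4 columns. Presumably this avoids launching tiles that are mostly padding for small batches; the toy calculation below is not from the diff and only quantifies the padding for a 32-column batch:

#include <cstdio>

int main() {
    const int batch = 32;               // MMQ_MAX_BATCH_SIZE: largest batch still routed to MMQ
    const int tile_widths[2] = {64, 4}; // old vs. new MMQ_X_Q4_0_AMPERE
    for (int mmq_x : tile_widths) {
        const int tiles  = (batch + mmq_x - 1)/mmq_x;
        const int padded = tiles*mmq_x;
        printf("MMQ_X=%3d -> %d tile(s), %d padded columns (%d wasted)\n",
               mmq_x, tiles, padded, padded - batch);
    }
    return 0;
}
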
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
3615
3647
  #define MMQ_X_Q4_1_RDNA1 64
3616
3648
  #define MMQ_Y_Q4_1_RDNA1 64
3617
3649
  #define NWARPS_Q4_1_RDNA1 8
3650
+ #if defined(CUDA_USE_TENSOR_CORES)
3651
+ #define MMQ_X_Q4_1_AMPERE 4
3652
+ #define MMQ_Y_Q4_1_AMPERE 32
3653
+ #define NWARPS_Q4_1_AMPERE 4
3654
+ #else
3618
3655
  #define MMQ_X_Q4_1_AMPERE 64
3619
3656
  #define MMQ_Y_Q4_1_AMPERE 128
3620
3657
  #define NWARPS_Q4_1_AMPERE 4
3658
+ #endif
3621
3659
  #define MMQ_X_Q4_1_PASCAL 64
3622
3660
  #define MMQ_Y_Q4_1_PASCAL 64
3623
3661
  #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
3678
3716
  #define MMQ_X_Q5_0_RDNA1 64
3679
3717
  #define MMQ_Y_Q5_0_RDNA1 64
3680
3718
  #define NWARPS_Q5_0_RDNA1 8
3719
+ #if defined(CUDA_USE_TENSOR_CORES)
3720
+ #define MMQ_X_Q5_0_AMPERE 4
3721
+ #define MMQ_Y_Q5_0_AMPERE 32
3722
+ #define NWARPS_Q5_0_AMPERE 4
3723
+ #else
3681
3724
  #define MMQ_X_Q5_0_AMPERE 128
3682
3725
  #define MMQ_Y_Q5_0_AMPERE 64
3683
3726
  #define NWARPS_Q5_0_AMPERE 4
3727
+ #endif
3684
3728
  #define MMQ_X_Q5_0_PASCAL 64
3685
3729
  #define MMQ_Y_Q5_0_PASCAL 64
3686
3730
  #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
3739
3783
  #define MMQ_X_Q5_1_RDNA1 64
3740
3784
  #define MMQ_Y_Q5_1_RDNA1 64
3741
3785
  #define NWARPS_Q5_1_RDNA1 8
3786
+ #if defined(CUDA_USE_TENSOR_CORES)
3787
+ #define MMQ_X_Q5_1_AMPERE 4
3788
+ #define MMQ_Y_Q5_1_AMPERE 32
3789
+ #define NWARPS_Q5_1_AMPERE 4
3790
+ #else
3742
3791
  #define MMQ_X_Q5_1_AMPERE 128
3743
3792
  #define MMQ_Y_Q5_1_AMPERE 64
3744
3793
  #define NWARPS_Q5_1_AMPERE 4
3794
+ #endif
3745
3795
  #define MMQ_X_Q5_1_PASCAL 64
3746
3796
  #define MMQ_Y_Q5_1_PASCAL 64
3747
3797
  #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
3800
3850
  #define MMQ_X_Q8_0_RDNA1 64
3801
3851
  #define MMQ_Y_Q8_0_RDNA1 64
3802
3852
  #define NWARPS_Q8_0_RDNA1 8
3853
+ #if defined(CUDA_USE_TENSOR_CORES)
3854
+ #define MMQ_X_Q8_0_AMPERE 4
3855
+ #define MMQ_Y_Q8_0_AMPERE 32
3856
+ #define NWARPS_Q8_0_AMPERE 4
3857
+ #else
3803
3858
  #define MMQ_X_Q8_0_AMPERE 128
3804
3859
  #define MMQ_Y_Q8_0_AMPERE 64
3805
3860
  #define NWARPS_Q8_0_AMPERE 4
3861
+ #endif
3806
3862
  #define MMQ_X_Q8_0_PASCAL 64
3807
3863
  #define MMQ_Y_Q8_0_PASCAL 64
3808
3864
  #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
3861
3917
  #define MMQ_X_Q2_K_RDNA1 128
3862
3918
  #define MMQ_Y_Q2_K_RDNA1 32
3863
3919
  #define NWARPS_Q2_K_RDNA1 8
3920
+ #if defined(CUDA_USE_TENSOR_CORES)
3921
+ #define MMQ_X_Q2_K_AMPERE 4
3922
+ #define MMQ_Y_Q2_K_AMPERE 32
3923
+ #define NWARPS_Q2_K_AMPERE 4
3924
+ #else
3864
3925
  #define MMQ_X_Q2_K_AMPERE 64
3865
3926
  #define MMQ_Y_Q2_K_AMPERE 128
3866
3927
  #define NWARPS_Q2_K_AMPERE 4
3928
+ #endif
3867
3929
  #define MMQ_X_Q2_K_PASCAL 64
3868
3930
  #define MMQ_Y_Q2_K_PASCAL 64
3869
3931
  #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
3922
3984
  #define MMQ_X_Q3_K_RDNA1 32
3923
3985
  #define MMQ_Y_Q3_K_RDNA1 128
3924
3986
  #define NWARPS_Q3_K_RDNA1 8
3987
+ #if defined(CUDA_USE_TENSOR_CORES)
3988
+ #define MMQ_X_Q3_K_AMPERE 4
3989
+ #define MMQ_Y_Q3_K_AMPERE 32
3990
+ #define NWARPS_Q3_K_AMPERE 4
3991
+ #else
3925
3992
  #define MMQ_X_Q3_K_AMPERE 128
3926
3993
  #define MMQ_Y_Q3_K_AMPERE 128
3927
3994
  #define NWARPS_Q3_K_AMPERE 4
3995
+ #endif
3928
3996
  #define MMQ_X_Q3_K_PASCAL 64
3929
3997
  #define MMQ_Y_Q3_K_PASCAL 64
3930
3998
  #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
3985
4053
  #define MMQ_X_Q4_K_RDNA1 32
3986
4054
  #define MMQ_Y_Q4_K_RDNA1 64
3987
4055
  #define NWARPS_Q4_K_RDNA1 8
4056
+ #if defined(CUDA_USE_TENSOR_CORES)
4057
+ #define MMQ_X_Q4_K_AMPERE 4
4058
+ #define MMQ_Y_Q4_K_AMPERE 32
4059
+ #define NWARPS_Q4_K_AMPERE 4
4060
+ #else
3988
4061
  #define MMQ_X_Q4_K_AMPERE 64
3989
4062
  #define MMQ_Y_Q4_K_AMPERE 128
3990
4063
  #define NWARPS_Q4_K_AMPERE 4
4064
+ #endif
3991
4065
  #define MMQ_X_Q4_K_PASCAL 64
3992
4066
  #define MMQ_Y_Q4_K_PASCAL 64
3993
4067
  #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
4048
4122
  #define MMQ_X_Q5_K_RDNA1 32
4049
4123
  #define MMQ_Y_Q5_K_RDNA1 64
4050
4124
  #define NWARPS_Q5_K_RDNA1 8
4125
+ #if defined(CUDA_USE_TENSOR_CORES)
4126
+ #define MMQ_X_Q5_K_AMPERE 4
4127
+ #define MMQ_Y_Q5_K_AMPERE 32
4128
+ #define NWARPS_Q5_K_AMPERE 4
4129
+ #else
4051
4130
  #define MMQ_X_Q5_K_AMPERE 64
4052
4131
  #define MMQ_Y_Q5_K_AMPERE 128
4053
4132
  #define NWARPS_Q5_K_AMPERE 4
4133
+ #endif
4054
4134
  #define MMQ_X_Q5_K_PASCAL 64
4055
4135
  #define MMQ_Y_Q5_K_PASCAL 64
4056
4136
  #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
4109
4189
  #define MMQ_X_Q6_K_RDNA1 32
4110
4190
  #define MMQ_Y_Q6_K_RDNA1 64
4111
4191
  #define NWARPS_Q6_K_RDNA1 8
4192
+ #if defined(CUDA_USE_TENSOR_CORES)
4193
+ #define MMQ_X_Q6_K_AMPERE 4
4194
+ #define MMQ_Y_Q6_K_AMPERE 32
4195
+ #define NWARPS_Q6_K_AMPERE 4
4196
+ #else
4112
4197
  #define MMQ_X_Q6_K_AMPERE 64
4113
4198
  #define MMQ_Y_Q6_K_AMPERE 64
4114
4199
  #define NWARPS_Q6_K_AMPERE 4
4200
+ #endif
4115
4201
  #define MMQ_X_Q6_K_PASCAL 64
4116
4202
  #define MMQ_Y_Q6_K_PASCAL 64
4117
4203
  #define NWARPS_Q6_K_PASCAL 8
@@ -4168,7 +4254,7 @@ template <bool need_check> static __global__ void
4168
4254
 
4169
4255
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
4170
4256
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
4171
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
4257
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
4172
4258
 
4173
4259
  if (row >= nrows) {
4174
4260
  return;
@@ -4208,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
4208
4294
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
4209
4295
  // qk = quantized weights per x block
4210
4296
  // qr = number of quantized weights per data value in x block
4211
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
4297
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
4212
4298
 
4213
4299
  if (row >= nrows) {
4214
4300
  return;
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4407
4493
  cpy_1(cx + x_offset, cdst + dst_offset);
4408
4494
  }
4409
4495
 
4410
- // rope == RoPE == rotary positional embedding
4496
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
4497
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
4498
+ return 1.0f - min(1.0f, max(0.0f, y));
4499
+ }
4500
+
4501
+ struct rope_corr_dims {
4502
+ float v[4];
4503
+ };
4504
+
4505
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
4506
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
4507
+ static __device__ void rope_yarn(
4508
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
4509
+ float * cos_theta, float * sin_theta
4510
+ ) {
4511
+ // Get n-d rotational scaling corrected for extrapolation
4512
+ float theta_interp = freq_scale * theta_extrap;
4513
+ float theta = theta_interp;
4514
+ if (ext_factor != 0.0f) {
4515
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
4516
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
4411
4517
 
4518
+ // Get n-d magnitude scaling corrected for interpolation
4519
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
4520
+ }
4521
+ *cos_theta = cosf(theta) * mscale;
4522
+ *sin_theta = sinf(theta) * mscale;
4523
+ }
4524
+
4525
+ // rope == RoPE == rotary positional embedding
4412
4526
  template<typename T, bool has_pos>
4413
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4414
- const int p_delta_rows, const float theta_scale) {
4527
+ static __global__ void rope(
4528
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4529
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4530
+ ) {
4415
4531
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4416
4532
 
4417
4533
  if (col >= ncols) {
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4423
4539
  const int i2 = row/p_delta_rows;
4424
4540
 
4425
4541
  const int p = has_pos ? pos[i2] : 0;
4426
- const float p0 = p*freq_scale;
4427
- const float theta = p0*powf(theta_scale, col/2);
4428
- const float sin_theta = sinf(theta);
4429
- const float cos_theta = cosf(theta);
4542
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
4543
+
4544
+ float cos_theta, sin_theta;
4545
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
4430
4546
 
4431
4547
  const float x0 = x[i + 0];
4432
4548
  const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4436
4552
  }
4437
4553
 
4438
4554
  template<typename T, bool has_pos>
4439
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4440
- const int p_delta_rows, const float theta_scale) {
4555
+ static __global__ void rope_neox(
4556
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4557
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4558
+ ) {
4441
4559
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4442
4560
 
4443
4561
  if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4448
4566
  const int i = row*ncols + col/2;
4449
4567
  const int i2 = row/p_delta_rows;
4450
4568
 
4569
+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
4570
+ const float cur_rot = -float(col)/ncols;
4571
+
4451
4572
  const int p = has_pos ? pos[i2] : 0;
4452
- const float p0 = p*freq_scale;
4453
- const float theta = p0*powf(theta_scale, col/2);
4454
- const float sin_theta = sinf(theta);
4455
- const float cos_theta = cosf(theta);
4573
+ const float theta_base = p*powf(freq_base, cur_rot);
4574
+
4575
+ float cos_theta, sin_theta;
4576
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
4456
4577
 
4457
4578
  const float x0 = x[i + 0];
4458
4579
  const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4461
4582
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4462
4583
  }
4463
4584
 
4464
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
4465
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
4585
+ static __global__ void rope_glm_f32(
4586
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4587
+ int n_ctx
4588
+ ) {
4466
4589
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4467
4590
  const int half_n_dims = ncols/4;
4468
4591
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4474
4597
  const int i = row*ncols + col;
4475
4598
  const int i2 = row/p_delta_rows;
4476
4599
 
4477
- const float col_theta_scale = powf(theta_scale, col);
4600
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
4478
4601
  // FIXME: this is likely wrong
4479
4602
  const int p = pos != nullptr ? pos[i2] : 0;
4480
4603
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
4616
4739
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4617
4740
  }
4618
4741
 
4742
+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
4743
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4744
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4745
+ }
4746
+
4619
4747
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4620
4748
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
4621
4749
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -4739,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
4739
4867
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4740
4868
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4741
4869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4742
- const dim3 block_nums(1, block_num_y, 1);
4870
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
4871
+ const dim3 block_nums(block_num_y, 1, 1);
4743
4872
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4744
4873
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
4745
4874
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
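
The comment above explains the blockIdx.y to blockIdx.x switch for the row index: CUDA caps gridDim.y and gridDim.z at 65535 blocks while gridDim.x can go up to 2^31 - 1, so mapping rows to the x dimension avoids launch failures on very tall matrices. A small stand-alone check (build with nvcc) that prints the limits on the local device:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return 1;
    }
    // maxGridSize[0] (x) is far larger than maxGridSize[1]/[2] (y/z)
    printf("maxGridSize: x=%d y=%d z=%d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}
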
@@ -4748,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
4748
4877
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4749
4878
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4750
4879
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4751
- const dim3 block_nums(1, block_num_y, 1);
4880
+ const dim3 block_nums(block_num_y, 1, 1);
4752
4881
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4753
4882
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
4754
4883
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4757,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
4757
4886
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4758
4887
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4759
4888
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4760
- const dim3 block_nums(1, block_num_y, 1);
4889
+ const dim3 block_nums(block_num_y, 1, 1);
4761
4890
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4762
4891
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
4763
4892
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4766,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
4766
4895
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4767
4896
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4768
4897
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4769
- const dim3 block_nums(1, block_num_y, 1);
4898
+ const dim3 block_nums(block_num_y, 1, 1);
4770
4899
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4771
4900
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
4772
4901
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4775,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
4775
4904
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4776
4905
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4777
4906
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4778
- const dim3 block_nums(1, block_num_y, 1);
4907
+ const dim3 block_nums(block_num_y, 1, 1);
4779
4908
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4780
4909
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
4781
4910
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4785,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
4785
4914
  GGML_ASSERT(ncols % QK_K == 0);
4786
4915
  const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
4787
4916
  const int block_num_y = (nrows + ny - 1) / ny;
4788
- const dim3 block_nums(1, block_num_y, 1);
4917
+ const dim3 block_nums(block_num_y, 1, 1);
4789
4918
  const dim3 block_dims(32, ny, 1);
4790
4919
  dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4791
4920
  }
@@ -4794,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
4794
4923
  GGML_ASSERT(ncols % QK_K == 0);
4795
4924
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4796
4925
  const int block_num_y = (nrows + ny - 1) / ny;
4797
- const dim3 block_nums(1, block_num_y, 1);
4926
+ const dim3 block_nums(block_num_y, 1, 1);
4798
4927
  const dim3 block_dims(32, ny, 1);
4799
4928
  dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4800
4929
  }
@@ -4803,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
4803
4932
  GGML_ASSERT(ncols % QK_K == 0);
4804
4933
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4805
4934
  const int block_num_y = (nrows + ny - 1) / ny;
4806
- const dim3 block_nums(1, block_num_y, 1);
4935
+ const dim3 block_nums(block_num_y, 1, 1);
4807
4936
  const dim3 block_dims(32, ny, 1);
4808
4937
  dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4809
4938
  }
@@ -4818,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
4818
4947
  GGML_ASSERT(ncols % QK_K == 0);
4819
4948
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4820
4949
  const int block_num_y = (nrows + ny - 1) / ny;
4821
- const dim3 block_nums(1, block_num_y, 1);
4950
+ const dim3 block_nums(block_num_y, 1, 1);
4822
4951
  const dim3 block_dims(32, ny, 1);
4823
4952
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4824
4953
  }
@@ -4826,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
4826
4955
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4827
4956
  GGML_ASSERT(ncols % QK4_0 == 0);
4828
4957
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4829
- const dim3 block_nums(1, block_num_y, 1);
4958
+ const dim3 block_nums(block_num_y, 1, 1);
4830
4959
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4831
4960
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
4832
4961
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4835,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
4835
4964
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4836
4965
  GGML_ASSERT(ncols % QK4_1 == 0);
4837
4966
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4838
- const dim3 block_nums(1, block_num_y, 1);
4967
+ const dim3 block_nums(block_num_y, 1, 1);
4839
4968
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4840
4969
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
4841
4970
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4844,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
4844
4973
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4845
4974
  GGML_ASSERT(ncols % QK5_0 == 0);
4846
4975
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4847
- const dim3 block_nums(1, block_num_y, 1);
4976
+ const dim3 block_nums(block_num_y, 1, 1);
4848
4977
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4849
4978
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
4850
4979
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4853,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
4853
4982
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4854
4983
  GGML_ASSERT(ncols % QK5_1 == 0);
4855
4984
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4856
- const dim3 block_nums(1, block_num_y, 1);
4985
+ const dim3 block_nums(block_num_y, 1, 1);
4857
4986
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4858
4987
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
4859
4988
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4862,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
4862
4991
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4863
4992
  GGML_ASSERT(ncols % QK8_0 == 0);
4864
4993
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4865
- const dim3 block_nums(1, block_num_y, 1);
4994
+ const dim3 block_nums(block_num_y, 1, 1);
4866
4995
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4867
4996
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
4868
4997
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4871,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
4871
5000
  static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4872
5001
  GGML_ASSERT(ncols % QK_K == 0);
4873
5002
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4874
- const dim3 block_nums(1, block_num_y, 1);
5003
+ const dim3 block_nums(block_num_y, 1, 1);
4875
5004
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4876
5005
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
4877
5006
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4880,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
4880
5009
  static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4881
5010
  GGML_ASSERT(ncols % QK_K == 0);
4882
5011
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4883
- const dim3 block_nums(1, block_num_y, 1);
5012
+ const dim3 block_nums(block_num_y, 1, 1);
4884
5013
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4885
5014
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
4886
5015
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4889,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
4889
5018
  static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4890
5019
  GGML_ASSERT(ncols % QK_K == 0);
4891
5020
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4892
- const dim3 block_nums(1, block_num_y, 1);
5021
+ const dim3 block_nums(block_num_y, 1, 1);
4893
5022
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4894
5023
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
4895
5024
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4898,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
4898
5027
  static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4899
5028
  GGML_ASSERT(ncols % QK_K == 0);
4900
5029
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4901
- const dim3 block_nums(1, block_num_y, 1);
5030
+ const dim3 block_nums(block_num_y, 1, 1);
4902
5031
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4903
5032
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
4904
5033
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4907,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
4907
5036
  static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4908
5037
  GGML_ASSERT(ncols % QK_K == 0);
4909
5038
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4910
- const dim3 block_nums(1, block_num_y, 1);
5039
+ const dim3 block_nums(block_num_y, 1, 1);
4911
5040
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4912
5041
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
4913
5042
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4926,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
4926
5055
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4927
5056
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4928
5057
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4929
- const dim3 block_nums(1, block_num_y, 1);
5058
+ const dim3 block_nums(block_num_y, 1, 1);
4930
5059
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4931
5060
  dequantize_mul_mat_vec<1, 1, convert_f16>
4932
5061
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5493,40 +5622,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
5493
5622
  }
5494
5623
 
5495
5624
  template<typename T>
5496
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5497
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5625
+ static void rope_cuda(
5626
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5627
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5628
+ ) {
5498
5629
  GGML_ASSERT(ncols % 2 == 0);
5499
5630
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5500
5631
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5501
5632
  const dim3 block_nums(nrows, num_blocks_x, 1);
5502
5633
  if (pos == nullptr) {
5503
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5634
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
5635
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5636
+ );
5504
5637
  } else {
5505
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5638
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
5639
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5640
+ );
5506
5641
  }
5507
5642
  }
5508
5643
 
5509
5644
  template<typename T>
5510
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5511
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5645
+ static void rope_neox_cuda(
5646
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5647
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5648
+ ) {
5512
5649
  GGML_ASSERT(ncols % 2 == 0);
5513
5650
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5514
5651
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5515
5652
  const dim3 block_nums(nrows, num_blocks_x, 1);
5516
5653
  if (pos == nullptr) {
5517
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5654
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
5655
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5656
+ );
5518
5657
  } else {
5519
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5658
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
5659
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5660
+ );
5520
5661
  }
5521
5662
  }
5522
5663
 
5523
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5524
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5664
+ static void rope_glm_f32_cuda(
5665
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5666
+ float freq_base, int n_ctx, cudaStream_t stream
5667
+ ) {
5525
5668
  GGML_ASSERT(ncols % 4 == 0);
5526
5669
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5527
5670
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
5528
5671
  const dim3 block_nums(num_blocks_x, nrows, 1);
5529
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
5672
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
5530
5673
  }
5531
5674
 
5532
5675
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5647,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
5647
5790
  CUDA_CHECK(cudaFree(ptr));
5648
5791
  }
5649
5792
 
5793
+ static bool g_cublas_loaded = false;
5794
+
5795
+ bool ggml_cublas_loaded(void) {
5796
+ return g_cublas_loaded;
5797
+ }
5650
5798
 
5651
5799
  void ggml_init_cublas() {
5652
5800
  static bool initialized = false;
@@ -5660,9 +5808,24 @@ void ggml_init_cublas() {
5660
5808
  CUDA_CHECK(cudaDeviceSynchronize());
5661
5809
  #endif
5662
5810
 
5663
- CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
5811
+ if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
5812
+ initialized = true;
5813
+ g_cublas_loaded = false;
5814
+ return;
5815
+ }
5816
+
5664
5817
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5665
5818
  int64_t total_vram = 0;
5819
+ #if defined(GGML_CUDA_FORCE_MMQ)
5820
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
5821
+ #else
5822
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
5823
+ #endif
5824
+ #if defined(CUDA_USE_TENSOR_CORES)
5825
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
5826
+ #else
5827
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
5828
+ #endif
5666
5829
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5667
5830
  for (int id = 0; id < g_device_count; ++id) {
5668
5831
  cudaDeviceProp prop;
@@ -5698,6 +5861,7 @@ void ggml_init_cublas() {
5698
5861
  // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
5699
5862
 
5700
5863
  initialized = true;
5864
+ g_cublas_loaded = true;
5701
5865
  }
5702
5866
  }
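
Usage sketch (assumption: the new ggml_cublas_loaded() is exported through ggml-cuda.h like the other entry points). Initialization no longer aborts when cudaGetDeviceCount() fails, so callers can probe whether the cuBLAS backend actually came up and fall back to CPU-only execution:

#include <cstdio>
#include "ggml-cuda.h"

int main() {
    ggml_init_cublas();
    if (!ggml_cublas_loaded()) {
        printf("no usable CUDA device, staying on CPU backends\n");
        return 0;
    }
    printf("cuBLAS backend ready\n");
    return 0;
}
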
5703
5867
 
@@ -5909,7 +6073,10 @@ inline void ggml_cuda_op_add(
5909
6073
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5910
6074
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5911
6075
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6076
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6077
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
5912
6078
  } else {
6079
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
5913
6080
  GGML_ASSERT(false);
5914
6081
  }
5915
6082
 
@@ -6347,7 +6514,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
6347
6514
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6348
6515
  row_diff, src1_ncols, ne10,
6349
6516
  &alpha, src0_ddf_i, ne00,
6350
- src1_ddf_i, ne10,
6517
+ src1_ddf_i, ne10,
6351
6518
  &beta, dst_dd_i, ldc));
6352
6519
 
6353
6520
  if (src0_as != 0) {
@@ -6373,17 +6540,20 @@ inline void ggml_cuda_op_rope(
6373
6540
  const int64_t ne2 = dst->ne[2];
6374
6541
  const int64_t nrows = ggml_nrows(src0);
6375
6542
 
6376
- //const int n_past = ((int32_t *) dst->op_params)[0];
6377
- const int n_dims = ((int32_t *) dst->op_params)[1];
6378
- const int mode = ((int32_t *) dst->op_params)[2];
6379
- const int n_ctx = ((int32_t *) dst->op_params)[3];
6380
- // RoPE alteration for extended context
6381
-
6382
- float freq_base, freq_scale;
6383
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
6384
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
6543
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6544
+ const int n_dims = ((int32_t *) dst->op_params)[1];
6545
+ const int mode = ((int32_t *) dst->op_params)[2];
6546
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
6547
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
6385
6548
 
6386
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
6549
+ // RoPE alteration for extended context
6550
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
6551
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
6552
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
6553
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
6554
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
6555
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
6556
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
6387
6557
 
6388
6558
  const int32_t * pos = nullptr;
6389
6559
  if ((mode & 1) == 0) {
@@ -6395,24 +6565,39 @@ inline void ggml_cuda_op_rope(
6395
6565
  const bool is_neox = mode & 2;
6396
6566
  const bool is_glm = mode & 4;
6397
6567
 
6568
+ rope_corr_dims corr_dims;
6569
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
6570
+
6398
6571
  // compute
6399
6572
  if (is_glm) {
6400
6573
  GGML_ASSERT(false);
6401
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6574
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
6402
6575
  } else if (is_neox) {
6403
6576
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6404
6577
  if (src0->type == GGML_TYPE_F32) {
6405
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6578
+ rope_neox_cuda(
6579
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6580
+ attn_factor, corr_dims, main_stream
6581
+ );
6406
6582
  } else if (src0->type == GGML_TYPE_F16) {
6407
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6583
+ rope_neox_cuda(
6584
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6585
+ attn_factor, corr_dims, main_stream
6586
+ );
6408
6587
  } else {
6409
6588
  GGML_ASSERT(false);
6410
6589
  }
6411
6590
  } else {
6412
6591
  if (src0->type == GGML_TYPE_F32) {
6413
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6592
+ rope_cuda(
6593
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6594
+ attn_factor, corr_dims, main_stream
6595
+ );
6414
6596
  } else if (src0->type == GGML_TYPE_F16) {
6415
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6597
+ rope_cuda(
6598
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6599
+ attn_factor, corr_dims, main_stream
6600
+ );
6416
6601
  } else {
6417
6602
  GGML_ASSERT(false);
6418
6603
  }
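
The YaRN rope path above reads ext_factor, attn_factor and the beta values from dst->op_params and derives corr_dims via ggml_rope_yarn_corr_dims. For validating the kernels, a plain host restatement of rope_yarn/rope_yarn_ramp can serve as a CPU reference; the corr_dims and input values below are placeholders for illustration only:

#include <algorithm>
#include <cmath>
#include <cstdio>

static float rope_yarn_ramp_ref(float low, float high, int i0) {
    const float y = (i0/2 - low) / std::max(0.001f, high - low);
    return 1.0f - std::min(1.0f, std::max(0.0f, y));
}

static void rope_yarn_ref(float theta_extrap, float freq_scale, const float corr_dims[2], int i0,
                          float ext_factor, float mscale, float * cos_theta, float * sin_theta) {
    const float theta_interp = freq_scale*theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
        const float ramp_mix = rope_yarn_ramp_ref(corr_dims[0], corr_dims[1], i0)*ext_factor;
        theta   = theta_interp*(1 - ramp_mix) + theta_extrap*ramp_mix;
        mscale *= 1.0f + 0.1f*std::log(1.0f/freq_scale);
    }
    *cos_theta = std::cos(theta)*mscale;
    *sin_theta = std::sin(theta)*mscale;
}

int main() {
    const float corr_dims[2] = {8.0f, 24.0f}; // placeholders; the real values come from
                                              // ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, ...)
    float c, s;
    for (int i0 = 0; i0 < 8; i0 += 2) {
        rope_yarn_ref(/*theta_extrap=*/100.0f, /*freq_scale=*/0.25f, corr_dims, i0,
                      /*ext_factor=*/1.0f, /*attn_factor=*/1.0f, &c, &s);
        printf("i0=%d cos_theta=%f sin_theta=%f\n", i0, c, s);
    }
    return 0;
}
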
@@ -6523,8 +6708,10 @@ inline void ggml_cuda_op_clamp(
6523
6708
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6524
6709
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6525
6710
 
6526
- const float min = ((float *) dst->op_params)[0];
6527
- const float max = ((float *) dst->op_params)[1];
6711
+ float min;
6712
+ float max;
6713
+ memcpy(&min, dst->op_params, sizeof(float));
6714
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
6528
6715
 
6529
6716
  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6530
6717
  CUDA_CHECK(cudaGetLastError());
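
The clamp bounds were previously read by casting the int32_t op_params buffer to float * and indexing it; the new code copies the bytes out instead. A minimal sketch of why that pattern is preferable (dereferencing through the cast violates C/C++ strict-aliasing rules, while memcpy of the raw bytes is well defined):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t op_params[2];
    const float min_v = -1.0f, max_v = 1.0f;
    memcpy(&op_params[0], &min_v, sizeof(float));   // how the bounds are stored
    memcpy(&op_params[1], &max_v, sizeof(float));

    float min_out, max_out;
    memcpy(&min_out, &op_params[0], sizeof(float)); // how the wrapper reads them back
    memcpy(&max_out, &op_params[1], sizeof(float));
    printf("min=%f max=%f\n", min_out, max_out);
    return 0;
}
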
@@ -6717,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
6717
6904
  int64_t row_low[GGML_CUDA_MAX_DEVICES];
6718
6905
  int64_t row_high[GGML_CUDA_MAX_DEVICES];
6719
6906
 
6907
+ int used_devices = 0;
6908
+
6720
6909
  for (int64_t id = 0; id < g_device_count; ++id) {
6721
6910
  // by default, use all rows
6722
6911
  row_low[id] = 0;
@@ -6744,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
6744
6933
  continue;
6745
6934
  }
6746
6935
 
6936
+ used_devices++;
6937
+
6747
6938
  const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6748
6939
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6749
6940
 
@@ -6782,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
6782
6973
 
6783
6974
  // if multiple devices are used they need to wait for the main device
6784
6975
  // here an event is recorded that signals that the main device has finished calculating the input data
6785
- if (split && g_device_count > 1) {
6976
+ if (split && used_devices > 1) {
6786
6977
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6787
6978
  CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6788
6979
  }
6789
6980
 
6790
- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6981
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6791
6982
  for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6792
6983
  const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6793
6984
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -6903,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
6903
7094
  }
6904
7095
 
6905
7096
  for (int64_t id = 0; id < g_device_count; ++id) {
7097
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
7098
+ continue;
7099
+ }
6906
7100
  CUDA_CHECK(ggml_cuda_set_device(id));
6907
7101
 
6908
7102
  // free buffers again when done
@@ -6927,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
6927
7121
 
6928
7122
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6929
7123
  for (int64_t id = 0; id < g_device_count; ++id) {
7124
+ if (row_low[id] == row_high[id]) {
7125
+ continue;
7126
+ }
6930
7127
  for (int64_t is = 0; is < is_max; ++is) {
6931
7128
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
6932
7129
  }
@@ -6972,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
6972
7169
  }
6973
7170
 
6974
7171
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7172
+ if (!g_cublas_loaded) return false;
7173
+
6975
7174
  const int64_t ne10 = src1->ne[0];
6976
7175
 
6977
7176
  const int64_t ne0 = dst->ne[0];
@@ -7048,9 +7247,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7048
7247
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7049
7248
  }
7050
7249
 
7051
- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
7250
+ __global__ void k_compute_batched_ptrs(
7251
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7252
+ const void ** ptrs_src, void ** ptrs_dst,
7253
+ int ne12, int ne13,
7254
+ int ne23,
7255
+ int nb02, int nb03,
7256
+ int nb12, int nb13,
7257
+ int nb2, int nb3,
7258
+ int r2, int r3) {
7259
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
7260
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
7261
+
7262
+ if (i13 >= ne13 || i12 >= ne12) {
7263
+ return;
7264
+ }
7265
+
7266
+ int i03 = i13 / r3;
7267
+ int i02 = i12 / r2;
7268
+
7269
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
7270
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
7271
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7272
+ }
7273
+
7274
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7052
7275
  GGML_ASSERT(!ggml_is_transposed(src0));
7053
7276
  GGML_ASSERT(!ggml_is_transposed(src1));
7277
+
7054
7278
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
7055
7279
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
7056
7280
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7372,45 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7148
7372
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7149
7373
  } else {
7150
7374
  // use cublasGemmBatchedEx
7151
- // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
7152
7375
  const int ne23 = ne12*ne13;
7153
7376
 
7154
- // TODO: avoid this alloc
7155
- void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
7156
-
7157
- for (int i13 = 0; i13 < ne13; ++i13) {
7158
- for (int i12 = 0; i12 < ne12; ++i12) {
7159
- int i03 = i13 / r3;
7160
- int i02 = i12 / r2;
7161
-
7162
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
7163
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
7164
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
7165
- }
7166
- }
7167
-
7168
- // allocate device memory for pointers
7169
- void ** ptrs_as = nullptr;
7170
- CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
7377
+ const void ** ptrs_src = nullptr;
7378
+ void ** ptrs_dst = nullptr;
7171
7379
 
7172
- // TODO: this does not work for some reason -- not sure why?
7173
- //size_t ptrs_s = 0;
7174
- //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7380
+ size_t ptrs_src_s = 0;
7381
+ size_t ptrs_dst_s = 0;
7175
7382
 
7176
- // copy pointers to device
7177
- CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
7383
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
7384
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
7178
7385
 
7179
- free(ptrs);
7386
+ dim3 block_dims(ne13, ne12);
7387
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
7388
+ src0_as_f16, src1_as_f16, dst_f16,
7389
+ ptrs_src, ptrs_dst,
7390
+ ne12, ne13,
7391
+ ne23,
7392
+ nb02, nb03,
7393
+ nb12, nb13,
7394
+ dst->nb[2], dst->nb[3],
7395
+ r2, r3);
7396
+ CUDA_CHECK(cudaGetLastError());
7180
7397
 
7181
7398
  CUBLAS_CHECK(
7182
7399
  cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7183
7400
  ne01, ne11, ne10,
7184
- &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7185
- (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7186
- &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7401
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7402
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7403
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
7187
7404
  ne23,
7188
7405
  CUBLAS_COMPUTE_16F,
7189
7406
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7190
7407
 
7191
- // free device memory for pointers
7192
- CUDA_CHECK(cudaFree(ptrs_as));
7193
- //ggml_cuda_pool_free(ptrs_as, ptrs_s);
7408
+ if (ptrs_src_s != 0) {
7409
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
7410
+ }
7411
+ if (ptrs_dst_s != 0) {
7412
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
7413
+ }
7194
7414
  }
7195
7415
  #endif
7196
7416
 
@@ -7202,17 +7422,26 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7202
7422
  }
7203
7423
 
7204
7424
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7205
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
7206
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
7425
+ const bool all_on_device =
7426
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
7427
+ (src1->backend == GGML_BACKEND_GPU) &&
7428
+ ( dst->backend == GGML_BACKEND_GPU);
7429
+
7430
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7207
7431
 
7208
7432
  int64_t min_compute_capability = INT_MAX;
7209
7433
  for (int64_t id = 0; id < g_device_count; ++id) {
7210
- if (min_compute_capability > g_compute_capabilities[id]
7211
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7434
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7212
7435
  min_compute_capability = g_compute_capabilities[id];
7213
7436
  }
7214
7437
  }
7215
7438
 
7439
+ #ifdef CUDA_USE_TENSOR_CORES
7440
+ const bool use_tensor_cores = true;
7441
+ #else
7442
+ const bool use_tensor_cores = false;
7443
+ #endif
7444
+
7216
7445
  // debug helpers
7217
7446
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
7218
7447
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7450,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7221
7450
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
7222
7451
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
7223
7452
 
7224
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7453
+ if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7225
7454
  // KQ single-batch
7226
7455
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7227
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7456
+ } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7228
7457
  // KQV single-batch
7229
7458
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7230
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
7459
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
7231
7460
  // KQ + KQV multi-batch
7232
7461
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
7233
7462
  } else if (src0->type == GGML_TYPE_F32) {
7234
7463
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7235
7464
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7236
7465
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
7237
-
7238
7466
  #ifdef GGML_CUDA_FORCE_DMMV
7239
7467
  const bool use_mul_mat_vec_q = false;
7240
7468
  #else
@@ -7247,7 +7475,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7247
7475
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
7248
7476
  }
7249
7477
  } else {
7250
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
7478
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
7479
+
7480
+ // when tensor cores are available, use them for large batch size
7481
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
7482
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
7483
+ use_mul_mat_q = false;
7484
+ }
7485
+
7486
+ if (use_mul_mat_q) {
7251
7487
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
7252
7488
  } else {
7253
7489
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
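
Putting the pieces together, the matrix-multiplication routing now reduces to a compile-time tensor-core flag plus the MMQ_MAX_BATCH_SIZE threshold instead of the removed g_mul_mat_q toggle. A condensed restatement (a sketch assuming the constants and types defined earlier in this file, not a drop-in replacement):

static bool route_to_mmq(int min_compute_capability, int64_t ne11, bool src0_is_quantized) {
#ifdef CUDA_USE_TENSOR_CORES
    const bool use_tensor_cores = true;
#else
    const bool use_tensor_cores = false;
#endif
    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && src0_is_quantized;
    if (use_tensor_cores && min_compute_capability >= CC_VOLTA && ne11 > MMQ_MAX_BATCH_SIZE) {
        use_mul_mat_q = false; // large batches go to cuBLAS and its F16 tensor cores
    }
    return use_mul_mat_q;
}
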
@@ -7601,10 +7837,6 @@ void ggml_cuda_set_main_device(const int main_device) {
7601
7837
  }
7602
7838
  }
7603
7839
 
7604
- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
7605
- g_mul_mat_q = mul_mat_q;
7606
- }
7607
-
7608
7840
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
7609
7841
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
7610
7842
  // it still won't always work as expected, but it's better than nothing
@@ -7624,6 +7856,8 @@ void ggml_cuda_free_scratch() {
7624
7856
  }
7625
7857
 
7626
7858
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7859
+ if (!g_cublas_loaded) return false;
7860
+
7627
7861
  ggml_cuda_func_t func;
7628
7862
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7629
7863
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))