llama_cpp 0.9.0 → 0.9.2

This diff shows the content changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -87,6 +87,24 @@
87
87
  #define CC_OFFSET_AMD 1000000
88
88
  #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
89
89
 
90
+ // define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
91
+ // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
92
+ // for large computational tasks. the drawback is that this requires some extra VRAM:
93
+ // - 7B quantum model: +100-200 MB
94
+ // - 13B quantum model: +200-400 MB
95
+ //
96
+ //#define GGML_CUDA_FORCE_MMQ
97
+
98
+ // TODO: improve this to be correct for more hardware
99
+ // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
100
+ // there are probably other such cases, and it is not clear what happens on AMD hardware
101
+ #if !defined(GGML_CUDA_FORCE_MMQ)
102
+ #define CUDA_USE_TENSOR_CORES
103
+ #endif
104
+
105
+ // max batch size to use MMQ kernels when tensor cores are available
106
+ #define MMQ_MAX_BATCH_SIZE 32
107
+
90
108
  #if defined(GGML_USE_HIPBLAS)
91
109
  #define __CUDA_ARCH__ 1300
92
110
 
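These new comments describe a compile-time policy: defining GGML_CUDA_FORCE_MMQ keeps every quantized matrix multiplication on the custom MMQ kernels (saving the VRAM listed above), while otherwise CUDA_USE_TENSOR_CORES is defined and cuBLAS with F16 tensor cores takes over once the batch exceeds MMQ_MAX_BATCH_SIZE (the runtime check is added near the end of this diff). A tiny translation unit showing how the two switches interact, mirroring the startup log lines that ggml_init_cublas() gains further down; this is an illustration only, compiled with or without -DGGML_CUDA_FORCE_MMQ:

    // illustrative sketch, not part of the package
    #include <cstdio>

    #if !defined(GGML_CUDA_FORCE_MMQ)
    #define CUDA_USE_TENSOR_CORES
    #endif

    #define MMQ_MAX_BATCH_SIZE 32

    int main() {
    #if defined(GGML_CUDA_FORCE_MMQ)
        printf("GGML_CUDA_FORCE_MMQ: yes (quantized mat-muls always use MMQ kernels)\n");
    #else
        printf("GGML_CUDA_FORCE_MMQ: no\n");
    #endif
    #if defined(CUDA_USE_TENSOR_CORES)
        printf("CUDA_USE_TENSOR_CORES: yes (cuBLAS/F16 tensor cores for batches > %d)\n", MMQ_MAX_BATCH_SIZE);
    #else
        printf("CUDA_USE_TENSOR_CORES: no\n");
    #endif
        return 0;
    }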
@@ -470,7 +488,6 @@ static int g_device_count = -1;
470
488
  static int g_main_device = 0;
471
489
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
472
490
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
473
- static bool g_mul_mat_q = true;
474
491
 
475
492
  static void * g_scratch_buffer = nullptr;
476
493
  static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
496
513
  dst[i] = __hadd(x[i], __float2half(y[i]));
497
514
  }
498
515
 
516
+ static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
517
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
518
+
519
+ if (i >= k) {
520
+ return;
521
+ }
522
+ dst[i] = __half2float(x[i]) + y[i];
523
+ }
524
+
499
525
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
500
526
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
501
527
 
@@ -956,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
956
982
 
957
983
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
958
984
 
959
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
985
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
960
986
  if (row > nrows) return;
961
987
 
962
988
  const int num_blocks_per_row = ncols / QK_K;
@@ -1060,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
1060
1086
 
1061
1087
  static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1062
1088
 
1063
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1089
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1064
1090
  if (row > nrows) return;
1065
1091
 
1066
1092
  const int num_blocks_per_row = ncols / QK_K;
@@ -1164,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
1164
1190
 
1165
1191
  static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1166
1192
 
1167
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1193
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1168
1194
  if (row > nrows) return;
1169
1195
  const int num_blocks_per_row = ncols / QK_K;
1170
1196
  const int ib0 = row*num_blocks_per_row;
@@ -1418,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
1418
1444
 
1419
1445
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1420
1446
 
1421
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
1447
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
1422
1448
  if (row > nrows) return;
1423
1449
 
1424
1450
  const int num_blocks_per_row = ncols / QK_K;
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
3554
3580
  #define MMQ_X_Q4_0_RDNA1 64
3555
3581
  #define MMQ_Y_Q4_0_RDNA1 64
3556
3582
  #define NWARPS_Q4_0_RDNA1 8
3583
+ #if defined(CUDA_USE_TENSOR_CORES)
3584
+ #define MMQ_X_Q4_0_AMPERE 4
3585
+ #define MMQ_Y_Q4_0_AMPERE 32
3586
+ #define NWARPS_Q4_0_AMPERE 4
3587
+ #else
3557
3588
  #define MMQ_X_Q4_0_AMPERE 64
3558
3589
  #define MMQ_Y_Q4_0_AMPERE 128
3559
3590
  #define NWARPS_Q4_0_AMPERE 4
3591
+ #endif
3560
3592
  #define MMQ_X_Q4_0_PASCAL 64
3561
3593
  #define MMQ_Y_Q4_0_PASCAL 64
3562
3594
  #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
3615
3647
  #define MMQ_X_Q4_1_RDNA1 64
3616
3648
  #define MMQ_Y_Q4_1_RDNA1 64
3617
3649
  #define NWARPS_Q4_1_RDNA1 8
3650
+ #if defined(CUDA_USE_TENSOR_CORES)
3651
+ #define MMQ_X_Q4_1_AMPERE 4
3652
+ #define MMQ_Y_Q4_1_AMPERE 32
3653
+ #define NWARPS_Q4_1_AMPERE 4
3654
+ #else
3618
3655
  #define MMQ_X_Q4_1_AMPERE 64
3619
3656
  #define MMQ_Y_Q4_1_AMPERE 128
3620
3657
  #define NWARPS_Q4_1_AMPERE 4
3658
+ #endif
3621
3659
  #define MMQ_X_Q4_1_PASCAL 64
3622
3660
  #define MMQ_Y_Q4_1_PASCAL 64
3623
3661
  #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
3678
3716
  #define MMQ_X_Q5_0_RDNA1 64
3679
3717
  #define MMQ_Y_Q5_0_RDNA1 64
3680
3718
  #define NWARPS_Q5_0_RDNA1 8
3719
+ #if defined(CUDA_USE_TENSOR_CORES)
3720
+ #define MMQ_X_Q5_0_AMPERE 4
3721
+ #define MMQ_Y_Q5_0_AMPERE 32
3722
+ #define NWARPS_Q5_0_AMPERE 4
3723
+ #else
3681
3724
  #define MMQ_X_Q5_0_AMPERE 128
3682
3725
  #define MMQ_Y_Q5_0_AMPERE 64
3683
3726
  #define NWARPS_Q5_0_AMPERE 4
3727
+ #endif
3684
3728
  #define MMQ_X_Q5_0_PASCAL 64
3685
3729
  #define MMQ_Y_Q5_0_PASCAL 64
3686
3730
  #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
3739
3783
  #define MMQ_X_Q5_1_RDNA1 64
3740
3784
  #define MMQ_Y_Q5_1_RDNA1 64
3741
3785
  #define NWARPS_Q5_1_RDNA1 8
3786
+ #if defined(CUDA_USE_TENSOR_CORES)
3787
+ #define MMQ_X_Q5_1_AMPERE 4
3788
+ #define MMQ_Y_Q5_1_AMPERE 32
3789
+ #define NWARPS_Q5_1_AMPERE 4
3790
+ #else
3742
3791
  #define MMQ_X_Q5_1_AMPERE 128
3743
3792
  #define MMQ_Y_Q5_1_AMPERE 64
3744
3793
  #define NWARPS_Q5_1_AMPERE 4
3794
+ #endif
3745
3795
  #define MMQ_X_Q5_1_PASCAL 64
3746
3796
  #define MMQ_Y_Q5_1_PASCAL 64
3747
3797
  #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
3800
3850
  #define MMQ_X_Q8_0_RDNA1 64
3801
3851
  #define MMQ_Y_Q8_0_RDNA1 64
3802
3852
  #define NWARPS_Q8_0_RDNA1 8
3853
+ #if defined(CUDA_USE_TENSOR_CORES)
3854
+ #define MMQ_X_Q8_0_AMPERE 4
3855
+ #define MMQ_Y_Q8_0_AMPERE 32
3856
+ #define NWARPS_Q8_0_AMPERE 4
3857
+ #else
3803
3858
  #define MMQ_X_Q8_0_AMPERE 128
3804
3859
  #define MMQ_Y_Q8_0_AMPERE 64
3805
3860
  #define NWARPS_Q8_0_AMPERE 4
3861
+ #endif
3806
3862
  #define MMQ_X_Q8_0_PASCAL 64
3807
3863
  #define MMQ_Y_Q8_0_PASCAL 64
3808
3864
  #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
3861
3917
  #define MMQ_X_Q2_K_RDNA1 128
3862
3918
  #define MMQ_Y_Q2_K_RDNA1 32
3863
3919
  #define NWARPS_Q2_K_RDNA1 8
3920
+ #if defined(CUDA_USE_TENSOR_CORES)
3921
+ #define MMQ_X_Q2_K_AMPERE 4
3922
+ #define MMQ_Y_Q2_K_AMPERE 32
3923
+ #define NWARPS_Q2_K_AMPERE 4
3924
+ #else
3864
3925
  #define MMQ_X_Q2_K_AMPERE 64
3865
3926
  #define MMQ_Y_Q2_K_AMPERE 128
3866
3927
  #define NWARPS_Q2_K_AMPERE 4
3928
+ #endif
3867
3929
  #define MMQ_X_Q2_K_PASCAL 64
3868
3930
  #define MMQ_Y_Q2_K_PASCAL 64
3869
3931
  #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
3922
3984
  #define MMQ_X_Q3_K_RDNA1 32
3923
3985
  #define MMQ_Y_Q3_K_RDNA1 128
3924
3986
  #define NWARPS_Q3_K_RDNA1 8
3987
+ #if defined(CUDA_USE_TENSOR_CORES)
3988
+ #define MMQ_X_Q3_K_AMPERE 4
3989
+ #define MMQ_Y_Q3_K_AMPERE 32
3990
+ #define NWARPS_Q3_K_AMPERE 4
3991
+ #else
3925
3992
  #define MMQ_X_Q3_K_AMPERE 128
3926
3993
  #define MMQ_Y_Q3_K_AMPERE 128
3927
3994
  #define NWARPS_Q3_K_AMPERE 4
3995
+ #endif
3928
3996
  #define MMQ_X_Q3_K_PASCAL 64
3929
3997
  #define MMQ_Y_Q3_K_PASCAL 64
3930
3998
  #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
3985
4053
  #define MMQ_X_Q4_K_RDNA1 32
3986
4054
  #define MMQ_Y_Q4_K_RDNA1 64
3987
4055
  #define NWARPS_Q4_K_RDNA1 8
4056
+ #if defined(CUDA_USE_TENSOR_CORES)
4057
+ #define MMQ_X_Q4_K_AMPERE 4
4058
+ #define MMQ_Y_Q4_K_AMPERE 32
4059
+ #define NWARPS_Q4_K_AMPERE 4
4060
+ #else
3988
4061
  #define MMQ_X_Q4_K_AMPERE 64
3989
4062
  #define MMQ_Y_Q4_K_AMPERE 128
3990
4063
  #define NWARPS_Q4_K_AMPERE 4
4064
+ #endif
3991
4065
  #define MMQ_X_Q4_K_PASCAL 64
3992
4066
  #define MMQ_Y_Q4_K_PASCAL 64
3993
4067
  #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
4048
4122
  #define MMQ_X_Q5_K_RDNA1 32
4049
4123
  #define MMQ_Y_Q5_K_RDNA1 64
4050
4124
  #define NWARPS_Q5_K_RDNA1 8
4125
+ #if defined(CUDA_USE_TENSOR_CORES)
4126
+ #define MMQ_X_Q5_K_AMPERE 4
4127
+ #define MMQ_Y_Q5_K_AMPERE 32
4128
+ #define NWARPS_Q5_K_AMPERE 4
4129
+ #else
4051
4130
  #define MMQ_X_Q5_K_AMPERE 64
4052
4131
  #define MMQ_Y_Q5_K_AMPERE 128
4053
4132
  #define NWARPS_Q5_K_AMPERE 4
4133
+ #endif
4054
4134
  #define MMQ_X_Q5_K_PASCAL 64
4055
4135
  #define MMQ_Y_Q5_K_PASCAL 64
4056
4136
  #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
4109
4189
  #define MMQ_X_Q6_K_RDNA1 32
4110
4190
  #define MMQ_Y_Q6_K_RDNA1 64
4111
4191
  #define NWARPS_Q6_K_RDNA1 8
4192
+ #if defined(CUDA_USE_TENSOR_CORES)
4193
+ #define MMQ_X_Q6_K_AMPERE 4
4194
+ #define MMQ_Y_Q6_K_AMPERE 32
4195
+ #define NWARPS_Q6_K_AMPERE 4
4196
+ #else
4112
4197
  #define MMQ_X_Q6_K_AMPERE 64
4113
4198
  #define MMQ_Y_Q6_K_AMPERE 64
4114
4199
  #define NWARPS_Q6_K_AMPERE 4
4200
+ #endif
4115
4201
  #define MMQ_X_Q6_K_PASCAL 64
4116
4202
  #define MMQ_Y_Q6_K_PASCAL 64
4117
4203
  #define NWARPS_Q6_K_PASCAL 8
@@ -4168,7 +4254,7 @@ template <bool need_check> static __global__ void
4168
4254
 
4169
4255
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
4170
4256
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
4171
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
4257
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
4172
4258
 
4173
4259
  if (row >= nrows) {
4174
4260
  return;
@@ -4208,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
4208
4294
  static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
4209
4295
  // qk = quantized weights per x block
4210
4296
  // qr = number of quantized weights per data value in x block
4211
- const int row = blockIdx.y*blockDim.y + threadIdx.y;
4297
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
4212
4298
 
4213
4299
  if (row >= nrows) {
4214
4300
  return;
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
4407
4493
  cpy_1(cx + x_offset, cdst + dst_offset);
4408
4494
  }
4409
4495
 
4410
- // rope == RoPE == rotary positional embedding
4496
+ static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
4497
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
4498
+ return 1.0f - min(1.0f, max(0.0f, y));
4499
+ }
4500
+
4501
+ struct rope_corr_dims {
4502
+ float v[4];
4503
+ };
4504
+
4505
+ // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
4506
+ // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
4507
+ static __device__ void rope_yarn(
4508
+ float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
4509
+ float * cos_theta, float * sin_theta
4510
+ ) {
4511
+ // Get n-d rotational scaling corrected for extrapolation
4512
+ float theta_interp = freq_scale * theta_extrap;
4513
+ float theta = theta_interp;
4514
+ if (ext_factor != 0.0f) {
4515
+ float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
4516
+ theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
4411
4517
 
4518
+ // Get n-d magnitude scaling corrected for interpolation
4519
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
4520
+ }
4521
+ *cos_theta = cosf(theta) * mscale;
4522
+ *sin_theta = sinf(theta) * mscale;
4523
+ }
4524
+
4525
+ // rope == RoPE == rotary positional embedding
4412
4526
  template<typename T, bool has_pos>
4413
- static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4414
- const int p_delta_rows, const float theta_scale) {
4527
+ static __global__ void rope(
4528
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4529
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4530
+ ) {
4415
4531
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4416
4532
 
4417
4533
  if (col >= ncols) {
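The rope_yarn device function above blends the interpolated angle (freq_scale * theta_extrap) with the extrapolated one using a per-dimension ramp over corr_dims, and rescales the magnitude by 1 + 0.1*ln(1/freq_scale) whenever extrapolation is active (ext_factor != 0). A host-side mirror of the same arithmetic, useful for comparing against the Python reference linked in the comment; this is a sketch that simply restates the device code:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    struct rope_corr_dims { float v[4]; };

    // CPU restatement of the rope_yarn device function shown above
    static void rope_yarn_ref(
            float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int i0,
            float ext_factor, float mscale, float * cos_theta, float * sin_theta) {
        // ramp is 1 below corr_dims.v[0], 0 above corr_dims.v[1], linear in between
        const float y    = (i0 / 2 - corr_dims.v[0]) / std::max(0.001f, corr_dims.v[1] - corr_dims.v[0]);
        const float ramp = 1.0f - std::min(1.0f, std::max(0.0f, y));

        const float theta_interp = freq_scale * theta_extrap;
        float theta = theta_interp;
        if (ext_factor != 0.0f) {
            const float ramp_mix = ramp * ext_factor;
            theta   = theta_interp*(1 - ramp_mix) + theta_extrap*ramp_mix;
            mscale *= 1.0f + 0.1f*std::log(1.0f/freq_scale);
        }
        *cos_theta = std::cos(theta)*mscale;
        *sin_theta = std::sin(theta)*mscale;
    }

    int main() {
        const rope_corr_dims cd = {{0.0f, 16.0f, 0.0f, 0.0f}};  // made-up correction range
        float c, s;
        rope_yarn_ref(100.0f, 0.25f, cd, 8, 1.0f, 1.0f, &c, &s);
        printf("cos_theta=%f sin_theta=%f\n", c, s);
        return 0;
    }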
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4423
4539
  const int i2 = row/p_delta_rows;
4424
4540
 
4425
4541
  const int p = has_pos ? pos[i2] : 0;
4426
- const float p0 = p*freq_scale;
4427
- const float theta = p0*powf(theta_scale, col/2);
4428
- const float sin_theta = sinf(theta);
4429
- const float cos_theta = cosf(theta);
4542
+ const float theta_base = p*powf(freq_base, -float(col)/ncols);
4543
+
4544
+ float cos_theta, sin_theta;
4545
+ rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
4430
4546
 
4431
4547
  const float x0 = x[i + 0];
4432
4548
  const float x1 = x[i + 1];
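The base angle is now computed from freq_base directly, and the freq_scale interpolation happens inside rope_yarn. For the plain-RoPE case (ext_factor == 0, attn_factor == 1) with ncols == n_dims this is the same angle as before: the old p*freq_scale*theta_scale^(col/2), with theta_scale = freq_base^(-2/n_dims), equals freq_scale*p*freq_base^(-col/ncols). A tiny numerical check of that identity (a sketch; the values are arbitrary):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base  = 10000.0f;
        const float freq_scale = 0.5f;
        const int   n_dims     = 128;   // assume ncols == n_dims
        const int   p          = 37;    // token position
        const int   col        = 20;    // even column index, as in the kernel

        const float theta_scale = std::pow(freq_base, -2.0f/n_dims);
        const float old_theta   = p*freq_scale*std::pow(theta_scale, col/2);       // pre-diff formula
        const float theta_base  = p*std::pow(freq_base, -float(col)/n_dims);       // new formula
        const float new_theta   = freq_scale*theta_base;                           // theta_interp inside rope_yarn

        printf("old=%f new=%f\n", old_theta, new_theta);  // agree up to rounding
        return 0;
    }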
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
4436
4552
  }
4437
4553
 
4438
4554
  template<typename T, bool has_pos>
4439
- static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
4440
- const int p_delta_rows, const float theta_scale) {
4555
+ static __global__ void rope_neox(
4556
+ const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4557
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims
4558
+ ) {
4441
4559
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4442
4560
 
4443
4561
  if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4448
4566
  const int i = row*ncols + col/2;
4449
4567
  const int i2 = row/p_delta_rows;
4450
4568
 
4569
+ // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
4570
+ const float cur_rot = -float(col)/ncols;
4571
+
4451
4572
  const int p = has_pos ? pos[i2] : 0;
4452
- const float p0 = p*freq_scale;
4453
- const float theta = p0*powf(theta_scale, col/2);
4454
- const float sin_theta = sinf(theta);
4455
- const float cos_theta = cosf(theta);
4573
+ const float theta_base = p*powf(freq_base, cur_rot);
4574
+
4575
+ float cos_theta, sin_theta;
4576
+ rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
4456
4577
 
4457
4578
  const float x0 = x[i + 0];
4458
4579
  const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
4461
4582
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4462
4583
  }
4463
4584
 
4464
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
4465
- const int p_delta_rows, const float theta_scale, const int n_ctx) {
4585
+ static __global__ void rope_glm_f32(
4586
+ const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
4587
+ int n_ctx
4588
+ ) {
4466
4589
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4467
4590
  const int half_n_dims = ncols/4;
4468
4591
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4474
4597
  const int i = row*ncols + col;
4475
4598
  const int i2 = row/p_delta_rows;
4476
4599
 
4477
- const float col_theta_scale = powf(theta_scale, col);
4600
+ const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
4478
4601
  // FIXME: this is likely wrong
4479
4602
  const int p = pos != nullptr ? pos[i2] : 0;
4480
4603
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
4616
4739
  add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4617
4740
  }
4618
4741
 
4742
+ static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
4743
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4744
+ add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
4745
+ }
4746
+
4619
4747
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4620
4748
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
4621
4749
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
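add_f16_f32_f32_cuda wraps the kernel introduced near the top of this diff, covering the F16 + F32 -> F32 case (ggml_cuda_op_add gains the matching branch further down). A minimal standalone harness exercising that kernel with this launch pattern; the block size and test values here are assumptions for illustration, not taken from the package:

    // illustrative .cu test, not part of the package
    #include <cuda_fp16.h>
    #include <cstdio>

    #define ADD_BLOCK_SIZE 256  // assumed block size for this example

    static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = __half2float(x[i]) + y[i];
    }

    int main() {
        const int k = 1000;
        half * x; float * y; float * dst;
        cudaMallocManaged(&x,   k*sizeof(half));
        cudaMallocManaged(&y,   k*sizeof(float));
        cudaMallocManaged(&dst, k*sizeof(float));
        for (int i = 0; i < k; ++i) { x[i] = __float2half(1.5f); y[i] = float(i); }

        const int num_blocks = (k + ADD_BLOCK_SIZE - 1) / ADD_BLOCK_SIZE;
        add_f16_f32_f32<<<num_blocks, ADD_BLOCK_SIZE>>>(x, y, dst, k);
        cudaDeviceSynchronize();

        printf("dst[10] = %.1f\n", dst[10]);  // expect 11.5
        cudaFree(x); cudaFree(y); cudaFree(dst);
        return 0;
    }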
@@ -4739,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
4739
4867
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4740
4868
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4741
4869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4742
- const dim3 block_nums(1, block_num_y, 1);
4870
+ // the number of rows may exceed the maximum grid size in the y or z dimensions, so use the x dimension instead
4871
+ const dim3 block_nums(block_num_y, 1, 1);
4743
4872
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4744
4873
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
4745
4874
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
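This comment is the reason for every blockIdx.y -> blockIdx.x change earlier in the diff: gridDim.x can address up to 2^31-1 blocks, while gridDim.y and gridDim.z are capped at 65535, so a grid that indexed rows through y could not cover very tall matrices. A small sketch of the launch arithmetic (the row count is hypothetical and GGML_CUDA_MMV_Y is assumed to be its default of 1):

    #include <cstdio>

    int main() {
        const long long max_grid_x = 2147483647LL; // documented CUDA limit for gridDim.x
        const long long max_grid_y = 65535LL;      // documented CUDA limit for gridDim.y and gridDim.z

        const long long nrows           = 100000;  // hypothetical very tall weight matrix
        const int       GGML_CUDA_MMV_Y = 1;       // rows handled per block (assumed default)

        const long long block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
        printf("blocks needed: %lld, fits in gridDim.y: %s, fits in gridDim.x: %s\n",
               block_num_y,
               block_num_y <= max_grid_y ? "yes" : "no",
               block_num_y <= max_grid_x ? "yes" : "no");
        return 0;
    }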
@@ -4748,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
4748
4877
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4749
4878
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4750
4879
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4751
- const dim3 block_nums(1, block_num_y, 1);
4880
+ const dim3 block_nums(block_num_y, 1, 1);
4752
4881
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4753
4882
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
4754
4883
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4757,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
4757
4886
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4758
4887
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4759
4888
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4760
- const dim3 block_nums(1, block_num_y, 1);
4889
+ const dim3 block_nums(block_num_y, 1, 1);
4761
4890
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4762
4891
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
4763
4892
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4766,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
4766
4895
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4767
4896
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4768
4897
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4769
- const dim3 block_nums(1, block_num_y, 1);
4898
+ const dim3 block_nums(block_num_y, 1, 1);
4770
4899
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4771
4900
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
4772
4901
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4775,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
4775
4904
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4776
4905
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4777
4906
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4778
- const dim3 block_nums(1, block_num_y, 1);
4907
+ const dim3 block_nums(block_num_y, 1, 1);
4779
4908
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4780
4909
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
4781
4910
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4785,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
4785
4914
  GGML_ASSERT(ncols % QK_K == 0);
4786
4915
  const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
4787
4916
  const int block_num_y = (nrows + ny - 1) / ny;
4788
- const dim3 block_nums(1, block_num_y, 1);
4917
+ const dim3 block_nums(block_num_y, 1, 1);
4789
4918
  const dim3 block_dims(32, ny, 1);
4790
4919
  dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4791
4920
  }
@@ -4794,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
4794
4923
  GGML_ASSERT(ncols % QK_K == 0);
4795
4924
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4796
4925
  const int block_num_y = (nrows + ny - 1) / ny;
4797
- const dim3 block_nums(1, block_num_y, 1);
4926
+ const dim3 block_nums(block_num_y, 1, 1);
4798
4927
  const dim3 block_dims(32, ny, 1);
4799
4928
  dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4800
4929
  }
@@ -4803,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
4803
4932
  GGML_ASSERT(ncols % QK_K == 0);
4804
4933
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4805
4934
  const int block_num_y = (nrows + ny - 1) / ny;
4806
- const dim3 block_nums(1, block_num_y, 1);
4935
+ const dim3 block_nums(block_num_y, 1, 1);
4807
4936
  const dim3 block_dims(32, ny, 1);
4808
4937
  dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4809
4938
  }
@@ -4818,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
4818
4947
  GGML_ASSERT(ncols % QK_K == 0);
4819
4948
  const int ny = 2 / K_QUANTS_PER_ITERATION;
4820
4949
  const int block_num_y = (nrows + ny - 1) / ny;
4821
- const dim3 block_nums(1, block_num_y, 1);
4950
+ const dim3 block_nums(block_num_y, 1, 1);
4822
4951
  const dim3 block_dims(32, ny, 1);
4823
4952
  dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
4824
4953
  }
@@ -4826,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
4826
4955
  static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4827
4956
  GGML_ASSERT(ncols % QK4_0 == 0);
4828
4957
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4829
- const dim3 block_nums(1, block_num_y, 1);
4958
+ const dim3 block_nums(block_num_y, 1, 1);
4830
4959
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4831
4960
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
4832
4961
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4835,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
4835
4964
  static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4836
4965
  GGML_ASSERT(ncols % QK4_1 == 0);
4837
4966
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4838
- const dim3 block_nums(1, block_num_y, 1);
4967
+ const dim3 block_nums(block_num_y, 1, 1);
4839
4968
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4840
4969
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
4841
4970
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4844,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
4844
4973
  static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4845
4974
  GGML_ASSERT(ncols % QK5_0 == 0);
4846
4975
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4847
- const dim3 block_nums(1, block_num_y, 1);
4976
+ const dim3 block_nums(block_num_y, 1, 1);
4848
4977
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4849
4978
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
4850
4979
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4853,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
4853
4982
  static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4854
4983
  GGML_ASSERT(ncols % QK5_1 == 0);
4855
4984
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4856
- const dim3 block_nums(1, block_num_y, 1);
4985
+ const dim3 block_nums(block_num_y, 1, 1);
4857
4986
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4858
4987
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
4859
4988
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4862,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
4862
4991
  static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4863
4992
  GGML_ASSERT(ncols % QK8_0 == 0);
4864
4993
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4865
- const dim3 block_nums(1, block_num_y, 1);
4994
+ const dim3 block_nums(block_num_y, 1, 1);
4866
4995
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4867
4996
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
4868
4997
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4871,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
4871
5000
  static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4872
5001
  GGML_ASSERT(ncols % QK_K == 0);
4873
5002
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4874
- const dim3 block_nums(1, block_num_y, 1);
5003
+ const dim3 block_nums(block_num_y, 1, 1);
4875
5004
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4876
5005
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
4877
5006
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4880,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
4880
5009
  static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4881
5010
  GGML_ASSERT(ncols % QK_K == 0);
4882
5011
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4883
- const dim3 block_nums(1, block_num_y, 1);
5012
+ const dim3 block_nums(block_num_y, 1, 1);
4884
5013
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4885
5014
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
4886
5015
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4889,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
4889
5018
  static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4890
5019
  GGML_ASSERT(ncols % QK_K == 0);
4891
5020
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4892
- const dim3 block_nums(1, block_num_y, 1);
5021
+ const dim3 block_nums(block_num_y, 1, 1);
4893
5022
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4894
5023
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
4895
5024
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4898,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
4898
5027
  static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4899
5028
  GGML_ASSERT(ncols % QK_K == 0);
4900
5029
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4901
- const dim3 block_nums(1, block_num_y, 1);
5030
+ const dim3 block_nums(block_num_y, 1, 1);
4902
5031
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4903
5032
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
4904
5033
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4907,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
4907
5036
  static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4908
5037
  GGML_ASSERT(ncols % QK_K == 0);
4909
5038
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4910
- const dim3 block_nums(1, block_num_y, 1);
5039
+ const dim3 block_nums(block_num_y, 1, 1);
4911
5040
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4912
5041
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
4913
5042
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4926,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
4926
5055
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4927
5056
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
4928
5057
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
4929
- const dim3 block_nums(1, block_num_y, 1);
5058
+ const dim3 block_nums(block_num_y, 1, 1);
4930
5059
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
4931
5060
  dequantize_mul_mat_vec<1, 1, convert_f16>
4932
5061
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5493,40 +5622,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
5493
5622
  }
5494
5623
 
5495
5624
  template<typename T>
5496
- static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5497
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5625
+ static void rope_cuda(
5626
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5627
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5628
+ ) {
5498
5629
  GGML_ASSERT(ncols % 2 == 0);
5499
5630
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5500
5631
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5501
5632
  const dim3 block_nums(nrows, num_blocks_x, 1);
5502
5633
  if (pos == nullptr) {
5503
- rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5634
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(
5635
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5636
+ );
5504
5637
  } else {
5505
- rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5638
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(
5639
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5640
+ );
5506
5641
  }
5507
5642
  }
5508
5643
 
5509
5644
  template<typename T>
5510
- static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5511
- const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
5645
+ static void rope_neox_cuda(
5646
+ const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5647
+ float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
5648
+ ) {
5512
5649
  GGML_ASSERT(ncols % 2 == 0);
5513
5650
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
5514
5651
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
5515
5652
  const dim3 block_nums(nrows, num_blocks_x, 1);
5516
5653
  if (pos == nullptr) {
5517
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5654
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
5655
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5656
+ );
5518
5657
  } else {
5519
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
5658
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
5659
+ x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
5660
+ );
5520
5661
  }
5521
5662
  }
5522
5663
 
5523
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5524
- const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5664
+ static void rope_glm_f32_cuda(
5665
+ const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
5666
+ float freq_base, int n_ctx, cudaStream_t stream
5667
+ ) {
5525
5668
  GGML_ASSERT(ncols % 4 == 0);
5526
5669
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5527
5670
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
5528
5671
  const dim3 block_nums(num_blocks_x, nrows, 1);
5529
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
5672
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
5530
5673
  }
5531
5674
 
5532
5675
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5647,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
5647
5790
  CUDA_CHECK(cudaFree(ptr));
5648
5791
  }
5649
5792
 
5793
+ static bool g_cublas_loaded = false;
5794
+
5795
+ bool ggml_cublas_loaded(void) {
5796
+ return g_cublas_loaded;
5797
+ }
5650
5798
 
5651
5799
  void ggml_init_cublas() {
5652
5800
  static bool initialized = false;
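ggml_init_cublas() previously aborted through CUDA_CHECK when cudaGetDeviceCount failed; it now records the failure, and the new ggml_cublas_loaded() query lets ggml_cuda_can_mul_mat and ggml_cuda_compute_forward (changed near the end of this diff) return false so execution falls back to the CPU. A self-contained sketch of that pattern; device_count_stub() is purely hypothetical and stands in for cudaGetDeviceCount():

    #include <cstdio>

    // sketch of the "record the failure instead of aborting" pattern used above
    static bool g_cublas_loaded = false;

    static int device_count_stub(int * count) { *count = 0; return 1; }  // nonzero = failure

    static void init_cublas_sketch(void) {
        static bool initialized = false;
        if (initialized) {
            return;
        }
        int device_count = 0;
        if (device_count_stub(&device_count) != 0) {   // cudaGetDeviceCount(...) != cudaSuccess
            initialized     = true;
            g_cublas_loaded = false;                   // remember the failure, do not abort
            return;
        }
        initialized     = true;
        g_cublas_loaded = true;
    }

    static bool cublas_loaded_sketch(void) { return g_cublas_loaded; }

    int main() {
        init_cublas_sketch();
        printf("cuBLAS loaded: %s\n", cublas_loaded_sketch() ? "yes" : "no (CPU fallback)");
        return 0;
    }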
@@ -5660,9 +5808,24 @@ void ggml_init_cublas() {
5660
5808
  CUDA_CHECK(cudaDeviceSynchronize());
5661
5809
  #endif
5662
5810
 
5663
- CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
5811
+ if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
5812
+ initialized = true;
5813
+ g_cublas_loaded = false;
5814
+ return;
5815
+ }
5816
+
5664
5817
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5665
5818
  int64_t total_vram = 0;
5819
+ #if defined(GGML_CUDA_FORCE_MMQ)
5820
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
5821
+ #else
5822
+ fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
5823
+ #endif
5824
+ #if defined(CUDA_USE_TENSOR_CORES)
5825
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
5826
+ #else
5827
+ fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
5828
+ #endif
5666
5829
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5667
5830
  for (int id = 0; id < g_device_count; ++id) {
5668
5831
  cudaDeviceProp prop;
@@ -5698,6 +5861,7 @@ void ggml_init_cublas() {
5698
5861
  // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
5699
5862
 
5700
5863
  initialized = true;
5864
+ g_cublas_loaded = true;
5701
5865
  }
5702
5866
  }
5703
5867
 
@@ -5909,7 +6073,10 @@ inline void ggml_cuda_op_add(
5909
6073
  add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5910
6074
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5911
6075
  add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
6076
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
6077
+ add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
5912
6078
  } else {
6079
+ fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
5913
6080
  GGML_ASSERT(false);
5914
6081
  }
5915
6082
 
@@ -6347,7 +6514,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
6347
6514
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
6348
6515
  row_diff, src1_ncols, ne10,
6349
6516
  &alpha, src0_ddf_i, ne00,
6350
- src1_ddf_i, ne10,
6517
+ src1_ddf_i, ne10,
6351
6518
  &beta, dst_dd_i, ldc));
6352
6519
 
6353
6520
  if (src0_as != 0) {
@@ -6373,17 +6540,20 @@ inline void ggml_cuda_op_rope(
6373
6540
  const int64_t ne2 = dst->ne[2];
6374
6541
  const int64_t nrows = ggml_nrows(src0);
6375
6542
 
6376
- //const int n_past = ((int32_t *) dst->op_params)[0];
6377
- const int n_dims = ((int32_t *) dst->op_params)[1];
6378
- const int mode = ((int32_t *) dst->op_params)[2];
6379
- const int n_ctx = ((int32_t *) dst->op_params)[3];
6380
- // RoPE alteration for extended context
6381
-
6382
- float freq_base, freq_scale;
6383
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
6384
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
6543
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6544
+ const int n_dims = ((int32_t *) dst->op_params)[1];
6545
+ const int mode = ((int32_t *) dst->op_params)[2];
6546
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
6547
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
6385
6548
 
6386
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
6549
+ // RoPE alteration for extended context
6550
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
6551
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
6552
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
6553
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
6554
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
6555
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
6556
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
6387
6557
 
6388
6558
  const int32_t * pos = nullptr;
6389
6559
  if ((mode & 1) == 0) {
@@ -6395,24 +6565,39 @@ inline void ggml_cuda_op_rope(
6395
6565
  const bool is_neox = mode & 2;
6396
6566
  const bool is_glm = mode & 4;
6397
6567
 
6568
+ rope_corr_dims corr_dims;
6569
+ ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
6570
+
6398
6571
  // compute
6399
6572
  if (is_glm) {
6400
6573
  GGML_ASSERT(false);
6401
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
6574
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
6402
6575
  } else if (is_neox) {
6403
6576
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
6404
6577
  if (src0->type == GGML_TYPE_F32) {
6405
- rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6578
+ rope_neox_cuda(
6579
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6580
+ attn_factor, corr_dims, main_stream
6581
+ );
6406
6582
  } else if (src0->type == GGML_TYPE_F16) {
6407
- rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6583
+ rope_neox_cuda(
6584
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6585
+ attn_factor, corr_dims, main_stream
6586
+ );
6408
6587
  } else {
6409
6588
  GGML_ASSERT(false);
6410
6589
  }
6411
6590
  } else {
6412
6591
  if (src0->type == GGML_TYPE_F32) {
6413
- rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6592
+ rope_cuda(
6593
+ (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6594
+ attn_factor, corr_dims, main_stream
6595
+ );
6414
6596
  } else if (src0->type == GGML_TYPE_F16) {
6415
- rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
6597
+ rope_cuda(
6598
+ (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
6599
+ attn_factor, corr_dims, main_stream
6600
+ );
6416
6601
  } else {
6417
6602
  GGML_ASSERT(false);
6418
6603
  }
@@ -6523,8 +6708,10 @@ inline void ggml_cuda_op_clamp(
6523
6708
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
6524
6709
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6525
6710
 
6526
- const float min = ((float *) dst->op_params)[0];
6527
- const float max = ((float *) dst->op_params)[1];
6711
+ float min;
6712
+ float max;
6713
+ memcpy(&min, dst->op_params, sizeof(float));
6714
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
6528
6715
 
6529
6716
  clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6530
6717
  CUDA_CHECK(cudaGetLastError());
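This clamp change is a correctness fix rather than a cosmetic one: dst->op_params is an int32_t array, and reading the float bounds through ((float *) dst->op_params)[0] type-puns the buffer through an incompatible lvalue, which is undefined behavior under strict aliasing; memcpy performs the same bit copy without that problem. A minimal sketch of the pattern:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        // op_params is an int32 buffer; float parameters are stored in it bit-for-bit
        int32_t op_params[2];
        const float min_v = -1.0f;
        const float max_v =  1.0f;
        memcpy(op_params + 0, &min_v, sizeof(float));
        memcpy(op_params + 1, &max_v, sizeof(float));

        // read them back the way the updated code does: memcpy, not ((float *) op_params)[i]
        float min_r, max_r;
        memcpy(&min_r, op_params + 0, sizeof(float));
        memcpy(&max_r, op_params + 1, sizeof(float));
        printf("clamp to [%.1f, %.1f]\n", min_r, max_r);
        return 0;
    }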
@@ -6717,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
6717
6904
  int64_t row_low[GGML_CUDA_MAX_DEVICES];
6718
6905
  int64_t row_high[GGML_CUDA_MAX_DEVICES];
6719
6906
 
6907
+ int used_devices = 0;
6908
+
6720
6909
  for (int64_t id = 0; id < g_device_count; ++id) {
6721
6910
  // by default, use all rows
6722
6911
  row_low[id] = 0;
@@ -6744,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
6744
6933
  continue;
6745
6934
  }
6746
6935
 
6936
+ used_devices++;
6937
+
6747
6938
  const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6748
6939
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6749
6940
 
@@ -6782,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
6782
6973
 
6783
6974
  // if multiple devices are used they need to wait for the main device
6784
6975
  // here an event is recorded that signals that the main device has finished calculating the input data
6785
- if (split && g_device_count > 1) {
6976
+ if (split && used_devices > 1) {
6786
6977
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6787
6978
  CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6788
6979
  }
6789
6980
 
6790
- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6981
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6791
6982
  for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6792
6983
  const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6793
6984
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -6903,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
6903
7094
  }
6904
7095
 
6905
7096
  for (int64_t id = 0; id < g_device_count; ++id) {
7097
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
7098
+ continue;
7099
+ }
6906
7100
  CUDA_CHECK(ggml_cuda_set_device(id));
6907
7101
 
6908
7102
  // free buffers again when done
@@ -6927,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
6927
7121
 
6928
7122
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6929
7123
  for (int64_t id = 0; id < g_device_count; ++id) {
7124
+ if (row_low[id] == row_high[id]) {
7125
+ continue;
7126
+ }
6930
7127
  for (int64_t is = 0; is < is_max; ++is) {
6931
7128
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
6932
7129
  }
@@ -6972,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
6972
7169
  }
6973
7170
 
6974
7171
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
7172
+ if (!g_cublas_loaded) return false;
7173
+
6975
7174
  const int64_t ne10 = src1->ne[0];
6976
7175
 
6977
7176
  const int64_t ne0 = dst->ne[0];
@@ -7048,9 +7247,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7048
7247
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7049
7248
  }
7050
7249
 
7051
- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
7250
+ __global__ void k_compute_batched_ptrs(
7251
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7252
+ const void ** ptrs_src, void ** ptrs_dst,
7253
+ int ne12, int ne13,
7254
+ int ne23,
7255
+ int nb02, int nb03,
7256
+ int nb12, int nb13,
7257
+ int nb2, int nb3,
7258
+ int r2, int r3) {
7259
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
7260
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
7261
+
7262
+ if (i13 >= ne13 || i12 >= ne12) {
7263
+ return;
7264
+ }
7265
+
7266
+ int i03 = i13 / r3;
7267
+ int i02 = i12 / r2;
7268
+
7269
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
7270
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
7271
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7272
+ }
7273
+
7274
+ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7052
7275
  GGML_ASSERT(!ggml_is_transposed(src0));
7053
7276
  GGML_ASSERT(!ggml_is_transposed(src1));
7277
+
7054
7278
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
7055
7279
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
7056
7280
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7372,45 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7148
7372
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7149
7373
  } else {
7150
7374
  // use cublasGemmBatchedEx
7151
- // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
7152
7375
  const int ne23 = ne12*ne13;
7153
7376
 
7154
- // TODO: avoid this alloc
7155
- void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
7156
-
7157
- for (int i13 = 0; i13 < ne13; ++i13) {
7158
- for (int i12 = 0; i12 < ne12; ++i12) {
7159
- int i03 = i13 / r3;
7160
- int i02 = i12 / r2;
7161
-
7162
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
7163
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
7164
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
7165
- }
7166
- }
7167
-
7168
- // allocate device memory for pointers
7169
- void ** ptrs_as = nullptr;
7170
- CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
7377
+ const void ** ptrs_src = nullptr;
7378
+ void ** ptrs_dst = nullptr;
7171
7379
 
7172
- // TODO: this does not work for some reason -- not sure why?
7173
- //size_t ptrs_s = 0;
7174
- //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7380
+ size_t ptrs_src_s = 0;
7381
+ size_t ptrs_dst_s = 0;
7175
7382
 
7176
- // copy pointers to device
7177
- CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
7383
+ ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
7384
+ ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
7178
7385
 
7179
- free(ptrs);
7386
+ dim3 block_dims(ne13, ne12);
7387
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
7388
+ src0_as_f16, src1_as_f16, dst_f16,
7389
+ ptrs_src, ptrs_dst,
7390
+ ne12, ne13,
7391
+ ne23,
7392
+ nb02, nb03,
7393
+ nb12, nb13,
7394
+ dst->nb[2], dst->nb[3],
7395
+ r2, r3);
7396
+ CUDA_CHECK(cudaGetLastError());
7180
7397
 
7181
7398
  CUBLAS_CHECK(
7182
7399
  cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7183
7400
  ne01, ne11, ne10,
7184
- &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7185
- (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7186
- &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7401
+ &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7402
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7403
+ &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
7187
7404
  ne23,
7188
7405
  CUBLAS_COMPUTE_16F,
7189
7406
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7190
7407
 
7191
- // free device memory for pointers
7192
- CUDA_CHECK(cudaFree(ptrs_as));
7193
- //ggml_cuda_pool_free(ptrs_as, ptrs_s);
7408
+ if (ptrs_src_s != 0) {
7409
+ ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
7410
+ }
7411
+ if (ptrs_dst_s != 0) {
7412
+ ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
7413
+ }
7194
7414
  }
7195
7415
  #endif
7196
7416
 
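The batched cuBLAS path no longer assembles the A/B/C pointer arrays on the host (malloc + cudaMemcpy + cudaFree on every call); k_compute_batched_ptrs now fills buffers taken from the existing CUDA memory pool directly on the device. The divisions by r2 and r3 broadcast the smaller src0 batch dimensions over src1, as in grouped-query attention, where r2 and r3 are the ratios of the src1 to src0 batch sizes. A CPU-only sketch of just that index mapping (the shapes are hypothetical):

    #include <cstdio>

    int main() {
        const int ne02 = 8,  ne03 = 1;    // src0 batch dims (e.g. 8 KV heads)
        const int ne12 = 32, ne13 = 1;    // src1 batch dims (e.g. 32 query heads)
        const int r2 = ne12/ne02, r3 = ne13/ne03;

        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                const int i03 = i13 / r3;  // src0 slice reused for r3 consecutive i13
                const int i02 = i12 / r2;  // src0 slice reused for r2 consecutive i12
                if (i12 < 6) {             // print only the first few GEMMs
                    printf("GEMM %2d: src1[%d][%2d] x src0[%d][%d]\n",
                           i13*ne12 + i12, i13, i12, i03, i02);
                }
            }
        }
        return 0;
    }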
@@ -7202,17 +7422,26 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7202
7422
  }
7203
7423
 
7204
7424
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7205
- bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
7206
- src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
7425
+ const bool all_on_device =
7426
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
7427
+ (src1->backend == GGML_BACKEND_GPU) &&
7428
+ ( dst->backend == GGML_BACKEND_GPU);
7429
+
7430
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
7207
7431
 
7208
7432
  int64_t min_compute_capability = INT_MAX;
7209
7433
  for (int64_t id = 0; id < g_device_count; ++id) {
7210
- if (min_compute_capability > g_compute_capabilities[id]
7211
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7434
+ if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7212
7435
  min_compute_capability = g_compute_capabilities[id];
7213
7436
  }
7214
7437
  }
7215
7438
 
7439
+ #ifdef CUDA_USE_TENSOR_CORES
7440
+ const bool use_tensor_cores = true;
7441
+ #else
7442
+ const bool use_tensor_cores = false;
7443
+ #endif
7444
+
7216
7445
  // debug helpers
7217
7446
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
7218
7447
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7450,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7221
7450
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
7222
7451
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
7223
7452
 
7224
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7453
+ if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7225
7454
  // KQ single-batch
7226
7455
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7227
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7456
+ } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
7228
7457
  // KQV single-batch
7229
7458
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7230
- } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
7459
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
7231
7460
  // KQ + KQV multi-batch
7232
7461
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
7233
7462
  } else if (src0->type == GGML_TYPE_F32) {
7234
7463
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7235
7464
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7236
7465
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
7237
-
7238
7466
  #ifdef GGML_CUDA_FORCE_DMMV
7239
7467
  const bool use_mul_mat_vec_q = false;
7240
7468
  #else
@@ -7247,7 +7475,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7247
7475
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
7248
7476
  }
7249
7477
  } else {
7250
- if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
7478
+ bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
7479
+
7480
+ // when tensor cores are available, use them for large batch size
7481
+ // ref: https://github.com/ggerganov/llama.cpp/pull/3776
7482
+ if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
7483
+ use_mul_mat_q = false;
7484
+ }
7485
+
7486
+ if (use_mul_mat_q) {
7251
7487
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
7252
7488
  } else {
7253
7489
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
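This block is the runtime counterpart of MMQ_MAX_BATCH_SIZE and CUDA_USE_TENSOR_CORES defined at the top of the diff, and it replaces the removed g_mul_mat_q global and ggml_cuda_set_mul_mat_q() setter. A condensed sketch of the policy (the helper name is hypothetical; the real code additionally requires min_compute_capability >= MIN_CC_DP4A for MMQ and >= CC_VOLTA for the tensor-core override):

    #include <cstdio>

    #define MMQ_MAX_BATCH_SIZE 32  // value from this diff

    static bool prefer_mmq(bool quantized_src0, bool have_tensor_cores, int batch_size) {
    #if defined(GGML_CUDA_FORCE_MMQ)
        (void) have_tensor_cores; (void) batch_size;
        return quantized_src0;                 // forced: quantized mat-muls always use MMQ
    #else
        if (!quantized_src0) {
            return false;                      // F32/F16 weights go through cuBLAS
        }
        // with tensor cores available, hand large batches to cuBLAS instead of MMQ
        return !(have_tensor_cores && batch_size > MMQ_MAX_BATCH_SIZE);
    #endif
    }

    int main() {
        printf("batch  8 -> MMQ: %d\n", prefer_mmq(true, true, 8));
        printf("batch 64 -> MMQ: %d\n", prefer_mmq(true, true, 64));
        return 0;
    }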
@@ -7601,10 +7837,6 @@ void ggml_cuda_set_main_device(const int main_device) {
7601
7837
  }
7602
7838
  }
7603
7839
 
7604
- void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
7605
- g_mul_mat_q = mul_mat_q;
7606
- }
7607
-
7608
7840
  void ggml_cuda_set_scratch_size(const size_t scratch_size) {
7609
7841
  // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
7610
7842
  // it still won't always work as expected, but it's better than nothing
@@ -7624,6 +7856,8 @@ void ggml_cuda_free_scratch() {
7624
7856
  }
7625
7857
 
7626
7858
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7859
+ if (!g_cublas_loaded) return false;
7860
+
7627
7861
  ggml_cuda_func_t func;
7628
7862
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7629
7863
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))