llama_cpp 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
--- data/ext/llama_cpp/src/ggml-cuda.cu (0.9.0)
+++ data/ext/llama_cpp/src/ggml-cuda.cu (0.9.1)
@@ -87,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+// probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
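Note: the two macros above are purely compile-time switches; defining GGML_CUDA_FORCE_MMQ removes CUDA_USE_TENSOR_CORES and with it the cuBLAS/tensor-core path. A minimal sketch of how a build might toggle this (the nvcc invocation and the constant below are illustrative, not part of the gem or the diff):

    // build with:  nvcc -DGGML_CUDA_FORCE_MMQ ...   to always use the quantized MMQ kernels
    // leaving the flag unset keeps CUDA_USE_TENSOR_CORES defined
    static constexpr bool use_tensor_cores_build =
    #ifdef CUDA_USE_TENSOR_CORES
        true;   // batches larger than MMQ_MAX_BATCH_SIZE (32) go to cuBLAS F16 GEMM
    #else
        false;  // always fall back to the MMQ kernels
    #endif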
@@ -470,7 +488,6 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
     dst[i] = __hadd(x[i], __float2half(y[i]));
 }
 
+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_0_AMPERE 4
+#define MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
+#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_1_AMPERE 4
+#define MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
+#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_0_AMPERE 4
+#define MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
+#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_1_AMPERE 4
+#define MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
+#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q8_0_AMPERE 4
+#define MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
+#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
 
+// rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
-static __global__ void rope(
-
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
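For reference, the rope_yarn math above can be reproduced on the host. This is a plain C++ sketch of the same arithmetic (the helper name, the correction values and the inputs in main() are illustrative only, not taken from the diff):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // host-side sketch of rope_yarn: blend the interpolated and extrapolated angles
    // with a per-dimension ramp, then rescale the magnitude for interpolation
    static void rope_yarn_ref(float theta_extrap, float freq_scale, const float corr[2],
                              int i0, float ext_factor, float mscale,
                              float * cos_theta, float * sin_theta) {
        const float y    = (i0/2 - corr[0]) / std::max(0.001f, corr[1] - corr[0]);
        const float ramp = 1.0f - std::min(1.0f, std::max(0.0f, y));   // rope_yarn_ramp

        const float theta_interp = freq_scale * theta_extrap;
        float theta = theta_interp;
        if (ext_factor != 0.0f) {
            const float ramp_mix = ramp * ext_factor;
            theta   = theta_interp*(1 - ramp_mix) + theta_extrap*ramp_mix;
            mscale *= 1.0f + 0.1f*std::log(1.0f/freq_scale);
        }
        *cos_theta = std::cos(theta)*mscale;
        *sin_theta = std::sin(theta)*mscale;
    }

    int main() {
        const float corr[2] = {4.0f, 16.0f};  // illustrative correction range
        float c, s;
        rope_yarn_ref(1.0f, 0.25f, corr, 8, 1.0f, 1.0f, &c, &s);
        std::printf("cos=%f sin=%f\n", c, s);
        return 0;
    }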
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
     const int i2 = row/p_delta_rows;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 }
 
 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-
+static __global__ void rope_neox(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     const int i = row*ncols + col/2;
     const int i2 = row/p_delta_rows;
 
+    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+    const float cur_rot = -float(col)/ncols;
+
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, cur_rot);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(
-
+static __global__ void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
     const int i2 = row/p_delta_rows;
 
-    const float col_theta_scale = powf(
+    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
     const int p = pos != nullptr ? pos[i2] : 0;
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
     add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
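The new launcher uses the same grid sizing as the other element-wise wrappers: one thread per element, with the block count rounded up and the tail masked by the `if (i >= k)` check inside the kernel. A generic sketch of that rounding (the helper name is made up):

    // ceil-division: enough blocks of block_size threads to cover k elements
    static inline int n_blocks_for(int k, int block_size) {
        return (k + block_size - 1) / block_size;
    }
    // e.g. k = 1000, block_size = 256  ->  4 blocks (1024 threads; 24 exit early)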
@@ -5493,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }
 
 template<typename T>
-static void rope_cuda(
-
+static void rope_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
 template<typename T>
-static void rope_neox_cuda(
-
+static void rope_neox_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
-static void rope_glm_f32_cuda(
-
+static void rope_glm_f32_cuda(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, int n_ctx, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5663,6 +5805,16 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
@@ -5909,7 +6061,10 @@ inline void ggml_cuda_op_add(
         add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
     } else {
+        fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }
 
@@ -6347,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
                     &alpha, src0_ddf_i, ne00,
-                            src1_ddf_i,
+                            src1_ddf_i, ne10,
                     &beta, dst_dd_i, ldc));
 
     if (src0_as != 0) {
@@ -6373,17 +6528,20 @@ inline void ggml_cuda_op_rope(
     const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    //const int n_past
-    const int n_dims
-    const int mode
-    const int n_ctx
-
-
-    float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
-
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
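The unpacking above pins down the RoPE layout of dst->op_params: slots 1-4 carry the integer parameters (slot 0 is the unused n_past) and slots 5-10 carry the six floats for the extended-context variant. A hypothetical mirror of the write side, shown only to make that layout explicit (the real packing lives in ggml.c, not in this file):

    #include <string.h>

    /* hypothetical helper: packs the values that ggml_cuda_op_rope reads back */
    static void pack_rope_op_params(int32_t op_params[11],
                                    int n_past, int n_dims, int mode, int n_ctx, int n_orig_ctx,
                                    float freq_base, float freq_scale, float ext_factor,
                                    float attn_factor, float beta_fast, float beta_slow) {
        op_params[0] = n_past;
        op_params[1] = n_dims;
        op_params[2] = mode;
        op_params[3] = n_ctx;
        op_params[4] = n_orig_ctx;
        memcpy(op_params +  5, &freq_base,   sizeof(float));
        memcpy(op_params +  6, &freq_scale,  sizeof(float));
        memcpy(op_params +  7, &ext_factor,  sizeof(float));
        memcpy(op_params +  8, &attn_factor, sizeof(float));
        memcpy(op_params +  9, &beta_fast,   sizeof(float));
        memcpy(op_params + 10, &beta_slow,   sizeof(float));
    }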
@@ -6395,24 +6553,39 @@ inline void ggml_cuda_op_rope(
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
     // compute
     if (is_glm) {
         GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
            GGML_ASSERT(false);
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
+            rope_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
             GGML_ASSERT(false);
         }
@@ -6523,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -7048,9 +7223,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-
+__global__ void k_compute_batched_ptrs(
+        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        void ** ptrs,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3) {
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
+
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7348,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
         const int ne23 = ne12*ne13;
 
-        // TODO: avoid this alloc
-        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
-                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
-                ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
-            }
-        }
-
-        // allocate device memory for pointers
         void ** ptrs_as = nullptr;
-
-
-
-
-
-
-
-
-
-
+        size_t ptrs_s = 0;
+        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        dim3 block_dims(ne13, ne12);
+        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+                src0_as_f16, src1_as_f16, dst_f16,
+                ptrs_as,
+                ne12, ne13,
+                ne23,
+                nb02, nb03,
+                nb12, nb13,
+                dst->nb[2], dst->nb[3],
+                r2, r3);
+        CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void
-                            (const void
-                &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                 ne23,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
-        CUDA_CHECK(cudaFree(ptrs_as));
-        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+        ggml_cuda_pool_free(ptrs_as, ptrs_s);
     }
 #endif
 
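Here the host-side malloc plus double loop is replaced by a pooled device buffer that k_compute_batched_ptrs fills directly on the GPU, so no host-to-device copy of the pointer table is needed and the pool allocation replaces malloc/cudaFree. The index math is unchanged; a CPU sketch of the pointer table handed to cublasGemmBatchedEx (parameter names follow the kernel above, the function itself is illustrative):

    // CPU reference for the pointer table: ne23 = ne12*ne13 GEMMs, indexed by
    // b = i12 + i13*ne12; r2/r3 broadcast one src0 slice across several src1 batches
    static void compute_batched_ptrs_ref(
            void * src0_as_f16, void * src1_as_f16, void * dst_f16,
            void ** ptrs, int ne12, int ne13, int ne23,
            size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nb2, size_t nb3,
            int r2, int r3) {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                const int b   = i12 + i13*ne12;
                const int i03 = i13 / r3;
                const int i02 = i12 / r2;
                ptrs[0*ne23 + b] = (char *) src0_as_f16 + i02*nb02   + i03*nb03;   // A slices
                ptrs[1*ne23 + b] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; // B slices (f32 strides halved for the f16 copies)
                ptrs[2*ne23 + b] = (char *)     dst_f16 + i12*nb2/2  + i13*nb3/2;  // C slices
            }
        }
    }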
@@ -7202,17 +7388,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    bool all_on_device =
-
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
 
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id]
-            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
             min_compute_capability = g_compute_capabilities[id];
         }
     }
 
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7414,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
+    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -7247,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
             }
         } else {
-
+            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+            // when tensor cores are available, use them for large batch size
+            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
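Combined with the use_tensor_cores constant defined earlier in ggml_cuda_mul_mat, the quantized path now keeps the MMQ kernels only for small batches on tensor-core GPUs. A condensed sketch of that decision (the helper is illustrative; MIN_CC_DP4A, CC_VOLTA and MMQ_MAX_BATCH_SIZE are the constants from ggml-cuda.cu):

    // illustrative summary of the dispatch above
    static bool should_use_mul_mat_q(bool use_tensor_cores, int64_t min_compute_capability,
                                     bool src0_is_quantized, int64_t src1_ne1) {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && src0_is_quantized;

        // tensor cores (Volta and newer) win once the batch exceeds MMQ_MAX_BATCH_SIZE (32)
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1_ne1 > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }
        return use_mul_mat_q;
    }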
@@ -7601,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-    g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing