llama_cpp 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
The hunks below are from data/ext/llama_cpp/src/ggml-cuda.cu, the bundled llama.cpp CUDA backend.

@@ -87,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+// probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
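Note on the block above: GGML_CUDA_FORCE_MMQ is off by default, so CUDA_USE_TENSOR_CORES gets defined automatically, and MMQ_MAX_BATCH_SIZE = 32 is the batch size up to which the quantized MMQ kernels are still preferred on tensor-core builds (the actual decision happens further down in ggml_cuda_mul_mat). The following stand-alone sketch is not part of the gem; it only illustrates how the two switches interact when compiling with or without -DGGML_CUDA_FORCE_MMQ.

    // Minimal, self-contained illustration (not part of ggml-cuda.cu).
    // Build normally for the tensor-core path, or with -DGGML_CUDA_FORCE_MMQ
    // to force the quantized MMQ kernels for every batch size.
    #include <cstdio>

    #if !defined(GGML_CUDA_FORCE_MMQ)
    #define CUDA_USE_TENSOR_CORES
    #endif

    #define MMQ_MAX_BATCH_SIZE 32

    int main() {
    #if defined(CUDA_USE_TENSOR_CORES)
        std::printf("batch <= %d: MMQ kernels, batch > %d: cuBLAS F16 tensor cores\n",
                    MMQ_MAX_BATCH_SIZE, MMQ_MAX_BATCH_SIZE);
    #else
        std::printf("GGML_CUDA_FORCE_MMQ set: MMQ kernels for all batch sizes\n");
    #endif
        return 0;
    }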
@@ -470,7 +488,6 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default

@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
     dst[i] = __hadd(x[i], __float2half(y[i]));
 }
 
+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_0_AMPERE 4
+#define MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
+#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8

@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_1_AMPERE 4
+#define MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
+#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8

@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_0_AMPERE 4
+#define MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
+#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8

@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_1_AMPERE 4
+#define MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
+#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8

@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q8_0_AMPERE 4
+#define MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
+#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8

@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8

@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8

@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8

@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8

@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
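The MMQ_X_*/MMQ_Y_*/NWARPS_* triples above size the tile that one mul_mat_q thread block processes, roughly MMQ_Y rows of the quantized matrix by MMQ_X columns of the batch, executed by NWARPS warps. Under CUDA_USE_TENSOR_CORES the Ampere tiles shrink to 4x32 because MMQ is then only used for batches up to MMQ_MAX_BATCH_SIZE; larger batches go through cuBLAS. The sketch below shows how such tile constants typically turn into a launch grid; the names nrows_x/ncols_y and the rounding convention are illustrative assumptions, not code copied from the gem.

    // Hypothetical helper mirroring the usual "round up to whole tiles" launch math.
    // Not taken from ggml-cuda.cu; tile_x/tile_y stand in for MMQ_X_*/MMQ_Y_*.
    #include <cstdio>

    struct grid2d { int blocks_rows; int blocks_cols; };

    static grid2d mmq_grid(int nrows_x, int ncols_y, int tile_y, int tile_x) {
        return {
            (nrows_x + tile_y - 1) / tile_y,   // blocks along the weight rows
            (ncols_y + tile_x - 1) / tile_x,   // blocks along the batch (columns of src1)
        };
    }

    int main() {
        // 4096 weight rows, a batch of 8 tokens, tensor-core Ampere tile 32x4:
        grid2d g = mmq_grid(4096, 8, /*tile_y=*/32, /*tile_x=*/4);
        std::printf("%d x %d thread blocks\n", g.blocks_rows, g.blocks_cols);
        return 0;
    }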
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-// rope == RoPE == rotary positional embedding
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
 
+// rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
-static __global__ void rope(
-
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
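In rope_yarn above, theta_extrap is the unscaled ("extrapolated") rotation angle, theta_interp = freq_scale * theta_extrap is the position-interpolated one, and rope_yarn_ramp blends between them per dimension inside the correction range corr_dims.v[0]..corr_dims.v[1]; when interpolation is active, the magnitude is additionally rescaled by 1 + 0.1*ln(1/freq_scale). For experimenting with these numbers outside CUDA, here is a host-side C++ transcription of the two helpers; the inputs in main() are made-up placeholders and std::fmax/fmin/cos/sin stand in for the device intrinsics.

    // Host-side transcription of rope_yarn_ramp / rope_yarn from the hunk above,
    // for experimentation only (no CUDA required). Placeholder inputs in main().
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    struct rope_corr_dims { float v[4]; };

    static float rope_yarn_ramp(float low, float high, int i0) {
        const float y = (i0 / 2 - low) / std::fmax(0.001f, high - low);
        return 1.0f - std::fmin(1.0f, std::fmax(0.0f, y));
    }

    static void rope_yarn(
        float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0,
        float ext_factor, float mscale, float * cos_theta, float * sin_theta
    ) {
        float theta_interp = freq_scale * theta_extrap;
        float theta = theta_interp;
        if (ext_factor != 0.0f) {
            float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], (int) i0) * ext_factor;
            theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
            mscale *= 1.0f + 0.1f * std::log(1.0f / freq_scale);
        }
        *cos_theta = std::cos(theta) * mscale;
        *sin_theta = std::sin(theta) * mscale;
    }

    int main() {
        rope_corr_dims corr = {{12.0f, 28.0f, 0.0f, 0.0f}};   // placeholder correction range
        float c, s;
        rope_yarn(/*theta_extrap=*/3.1f, /*freq_scale=*/0.25f, corr, /*i0=*/16,
                  /*ext_factor=*/1.0f, /*mscale=*/1.0f, &c, &s);
        std::printf("cos=%f sin=%f\n", c, s);
        return 0;
    }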
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
     const int i2 = row/p_delta_rows;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + 1];

@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 }
 
 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-
+static __global__ void rope_neox(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {

@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     const int i = row*ncols + col/2;
     const int i2 = row/p_delta_rows;
 
+    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+    const float cur_rot = -float(col)/ncols;
+
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, cur_rot);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + ncols/2];

@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(
-
+static __global__ void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
     const int i2 = row/p_delta_rows;
 
-    const float col_theta_scale = powf(
+    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
     const int p = pos != nullptr ? pos[i2] : 0;
 

@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
     add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5493,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }
 
 template<typename T>
-static void rope_cuda(
-
+static void rope_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
 template<typename T>
-static void rope_neox_cuda(
-
+static void rope_neox_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
-static void rope_glm_f32_cuda(
-
+static void rope_glm_f32_cuda(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, int n_ctx, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5663,6 +5805,16 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;

@@ -5909,7 +6061,10 @@ inline void ggml_cuda_op_add(
         add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
     } else {
+        fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }
 
@@ -6347,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
                 &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i,
+                        src1_ddf_i, ne10,
                 &beta, dst_dd_i, ldc));
 
     if (src0_as != 0) {
@@ -6373,17 +6528,20 @@ inline void ggml_cuda_op_rope(
     const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-
-
-    float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
-
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {

@@ -6395,24 +6553,39 @@ inline void ggml_cuda_op_rope(
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
     // compute
     if (is_glm) {
         GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
             GGML_ASSERT(false);
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
+            rope_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
             GGML_ASSERT(false);
         }
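A note on dst->op_params as read above: ggml passes per-op settings in a small array of 32-bit slots. For this RoPE op, slots 0-4 hold the integers (n_past, n_dims, mode, n_ctx, and now n_orig_ctx), and the six extended-context floats occupy slots 5-10, read back with memcpy so their bit patterns pass through the int32 storage unchanged. The sketch below mirrors that layout from the writing side; rope_params and pack() are invented names for illustration, presumably matching what the CPU side of ggml (e.g. ggml_rope_custom) stores, which is not part of this file.

    // Illustrative only: the memcpy-based packing that matches the reads above.
    // rope_params and pack() are hypothetical names, not ggml functions.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    struct rope_params {
        int32_t n_past, n_dims, mode, n_ctx, n_orig_ctx;          // slots 0..4
        float   freq_base, freq_scale, ext_factor, attn_factor,   // slots 5..8
                beta_fast, beta_slow;                             // slots 9..10
    };

    static void pack(const rope_params & p, int32_t op_params[16]) {
        op_params[0] = p.n_past;  op_params[1] = p.n_dims;  op_params[2] = p.mode;
        op_params[3] = p.n_ctx;   op_params[4] = p.n_orig_ctx;
        std::memcpy(op_params +  5, &p.freq_base,   sizeof(float));
        std::memcpy(op_params +  6, &p.freq_scale,  sizeof(float));
        std::memcpy(op_params +  7, &p.ext_factor,  sizeof(float));
        std::memcpy(op_params +  8, &p.attn_factor, sizeof(float));
        std::memcpy(op_params +  9, &p.beta_fast,   sizeof(float));
        std::memcpy(op_params + 10, &p.beta_slow,   sizeof(float));
    }

    int main() {
        int32_t op_params[16] = {0};
        pack({0, 128, 0, 4096, 4096, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f}, op_params);

        float freq_base;                               // read back exactly like the CUDA op does
        std::memcpy(&freq_base, op_params + 5, sizeof(float));
        std::printf("freq_base = %.1f\n", freq_base);
        return 0;
    }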
@@ -6523,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -7048,9 +7223,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+__global__ void k_compute_batched_ptrs(
+        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        void ** ptrs,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3) {
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
+
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -7148,49 +7348,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
         const int ne23 = ne12*ne13;
 
-        // TODO: avoid this alloc
-        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
-                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
-                ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
-            }
-        }
-
-        // allocate device memory for pointers
         void ** ptrs_as = nullptr;
-
-
-
-
-
-
-
-
-
-
+        size_t ptrs_s = 0;
+        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        dim3 block_dims(ne13, ne12);
+        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+                src0_as_f16, src1_as_f16, dst_f16,
+                ptrs_as,
+                ne12, ne13,
+                ne23,
+                nb02, nb03,
+                nb12, nb13,
+                dst->nb[2], dst->nb[3],
+                r2, r3);
+        CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void
-                            (const void
-                &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                 ne23,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
-        CUDA_CHECK(cudaFree(ptrs_as));
-        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
+        ggml_cuda_pool_free(ptrs_as, ptrs_s);
     }
 #endif
 
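cublasGemmBatchedEx takes arrays of device pointers, one entry per matrix in the batch, for A, B and C. The code above keeps all of them in a single table of 3*ne23 pointers (the A group at offset 0, B at ne23, C at 2*ne23, each indexed by i12 + i13*ne12), allocates that table from the CUDA memory pool instead of malloc, and now fills it on the GPU with k_compute_batched_ptrs rather than building it on the host and copying it over; the /2 on the src1 and dst strides reflects that those buffers were converted to F16 while their nb[] values still describe the original F32 layout. The host-side sketch below reproduces only the index/offset arithmetic with made-up sizes, to make the table layout concrete; it is not code from the gem.

    // Layout-only illustration of the 3*ne23 pointer table fed to the batched GEMM.
    // Sizes are placeholders; offsets are byte offsets from each tensor's base pointer.
    #include <cstdio>
    #include <vector>

    int main() {
        const int ne12 = 2, ne13 = 3;            // placeholder batch dimensions
        const int ne23 = ne12 * ne13;            // one GEMM per (i12, i13) pair
        const int r2 = 1, r3 = 1;                // broadcast ratios (no broadcasting here)
        const long nb02 = 1024, nb03 = 2048;     // placeholder byte strides of src0 (f16)
        const long nb12 = 512,  nb13 = 1024;     // placeholder byte strides of src1 (f32)
        const long nb2  = 512,  nb3  = 1024;     // placeholder byte strides of dst  (f32)

        std::vector<long> offs(3 * ne23);        // A offsets, then B offsets, then C offsets
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                const int i03 = i13 / r3, i02 = i12 / r2;
                offs[0*ne23 + i12 + i13*ne12] = i02*nb02 + i03*nb03;       // A (src0, already f16)
                offs[1*ne23 + i12 + i13*ne12] = i12*nb12/2 + i13*nb13/2;   // B (src1 converted to f16)
                offs[2*ne23 + i12 + i13*ne12] = i12*nb2/2  + i13*nb3/2;    // C (f16 staging for dst)
            }
        }
        std::printf("built %zu offsets for %d batched GEMMs\n", offs.size(), ne23);
        return 0;
    }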
@@ -7202,17 +7388,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    bool all_on_device =
-
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
 
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id]
-            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
             min_compute_capability = g_compute_capabilities[id];
         }
     }
 
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);

@@ -7221,20 +7414,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
+    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else

@@ -7247,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-
+        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+        // when tensor cores are available, use them for large batch size
+        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+            use_mul_mat_q = false;
+        }
+
+        if (use_mul_mat_q) {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
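Together with the constants introduced at the top of the file, the dispatch in ggml_cuda_mul_mat now reads: the single-batch F16 shortcuts are skipped when tensor cores are in use, quantized weights normally go to the MMQ kernels (given DP4A support), but on tensor-core builds with compute capability at or above CC_VOLTA any batch larger than MMQ_MAX_BATCH_SIZE is routed to cuBLAS instead (see the PR referenced in the comment). The function below condenses that heuristic into a stand-alone sketch; the enum, the parameter names and the assumption that DP4A is available are illustrative, not the gem's API.

    // Condensed, illustrative version of the batch-size heuristic added above.
    // Thresholds mirror the diff (MMQ_MAX_BATCH_SIZE = 32); CC_VOLTA is 700 in ggml-cuda.cu.
    #include <cstdio>

    enum class mul_mat_path { mmq, cublas };

    static mul_mat_path pick_path(bool use_tensor_cores, bool src0_quantized,
                                  int compute_capability, int batch_size) {
        const int CC_VOLTA           = 700;
        const int MMQ_MAX_BATCH_SIZE = 32;

        bool use_mul_mat_q = src0_quantized;             // assume DP4A is available
        if (use_tensor_cores && compute_capability >= CC_VOLTA && batch_size > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;                       // large batch: prefer F16 tensor cores
        }
        return use_mul_mat_q ? mul_mat_path::mmq : mul_mat_path::cublas;
    }

    int main() {
        std::printf("batch 8   -> %s\n", pick_path(true, true, 800, 8)   == mul_mat_path::mmq ? "MMQ" : "cuBLAS");
        std::printf("batch 512 -> %s\n", pick_path(true, true, 800, 512) == mul_mat_path::mmq ? "MMQ" : "cuBLAS");
        return 0;
    }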
@@ -7601,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-    g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing