llama_cpp 0.9.0 → 0.9.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +353 -119
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +60 -38
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1004 -3572
- data/ext/llama_cpp/src/ggml.h +30 -4
- data/ext/llama_cpp/src/llama.cpp +1945 -2648
- data/ext/llama_cpp/src/llama.h +37 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -87,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+//       probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
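Note: the new header block encodes a compile-time choice — unless GGML_CUDA_FORCE_MMQ is defined, CUDA_USE_TENSOR_CORES is enabled and cuBLAS (F16 tensor cores) is preferred over the quantized MMQ kernels once the batch exceeds MMQ_MAX_BATCH_SIZE. A minimal host-side sketch of that decision (illustrative only; compute_capability and batch_size are stand-in parameters, not ggml API, and CC_VOLTA is 700 as defined in ggml-cuda.cu):

// Sketch: mirrors the MMQ-vs-cuBLAS choice implied by the macros above.
#include <cstdio>

#define MMQ_MAX_BATCH_SIZE 32
#define CC_VOLTA 700

#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

static bool prefer_mmq(int compute_capability, int batch_size) {
#ifdef CUDA_USE_TENSOR_CORES
    // with tensor cores available, fall back to cuBLAS for large batches
    if (compute_capability >= CC_VOLTA && batch_size > MMQ_MAX_BATCH_SIZE) {
        return false;
    }
#endif
    return true; // otherwise stay on the quantized MMQ kernels
}

int main() {
    printf("batch 8:  use MMQ = %d\n", prefer_mmq(800, 8));
    printf("batch 64: use MMQ = %d\n", prefer_mmq(800, 64));
    return 0;
}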
@@ -470,7 +488,6 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
     dst[i] = __hadd(x[i], __float2half(y[i]));
 }
 
+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -956,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1060,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -1164,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -1418,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
     if (row > nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_0_AMPERE 4
+#define MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
+#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_1_AMPERE 4
+#define MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
+#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_0_AMPERE 4
+#define MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
+#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_1_AMPERE 4
+#define MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
+#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q8_0_AMPERE 4
+#define MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
+#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -4168,7 +4254,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4208,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
         return;
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
 
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
-static __global__ void rope(
-
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
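Note: written out as formulas, the rope_yarn device function added above computes (this is a restatement of the code, not an addition to it), with s = freq_scale, e = ext_factor, m = mscale, and low/high = corr_dims.v[0]/v[1]:

r(i_0) = 1 - \mathrm{clamp}\!\left(\frac{i_0/2 - \mathrm{low}}{\mathrm{high}-\mathrm{low}},\ 0,\ 1\right), \qquad \gamma = r(i_0)\, e

\theta = (1-\gamma)\, s\,\theta_{\mathrm{extrap}} + \gamma\,\theta_{\mathrm{extrap}}, \qquad
m' = \begin{cases} m\,(1 + 0.1\,\ln(1/s)) & e \neq 0 \\ m & e = 0 \end{cases}

\cos\text{-out} = m'\cos\theta, \qquad \sin\text{-out} = m'\sin\theta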
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
     const int i2 = row/p_delta_rows;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 }
 
 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-
+static __global__ void rope_neox(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     const int i = row*ncols + col/2;
     const int i2 = row/p_delta_rows;
 
+    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+    const float cur_rot = -float(col)/ncols;
+
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, cur_rot);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
     const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(
-
+static __global__ void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
     const int i2 = row/p_delta_rows;
 
-    const float col_theta_scale = powf(
+    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
     const int p = pos != nullptr ? pos[i2] : 0;
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
     add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -4739,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
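Note: the comment in this hunk is the motivation for the long run of block_nums changes that follows — CUDA caps grid dimensions y and z at 65535 blocks while x allows up to 2^31-1, so putting one block per row in the x dimension (matching the blockIdx.x change in the kernels earlier) avoids launch failures on tensors with very many rows. A standalone sketch of the same launch shape, assuming the default GGML_CUDA_MMV_Y of 1 (the kernel is a placeholder, not a ggml kernel):

// Sketch: one block per row in grid.x, matching the new launch layout above.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_rows_kernel(int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row >= nrows) return;
    // per-row work would go here
}

int main() {
    const int nrows = 1 << 20;                 // more rows than the 65535 limit on grid.y/z
    const int mmv_y = 1;                       // rows handled per block (GGML_CUDA_MMV_Y default)
    const int block_num = (nrows + mmv_y - 1) / mmv_y;
    const dim3 block_nums(block_num, 1, 1);    // rows go in the x dimension
    const dim3 block_dims(32, mmv_y, 1);
    dummy_rows_kernel<<<block_nums, block_dims>>>(nrows);
    printf("launched %d blocks\n", block_num);
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}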
@@ -4748,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4757,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4766,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4775,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4785,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4794,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4803,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4818,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4826,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4835,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4844,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4853,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4862,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4871,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4880,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4889,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4898,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4907,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4926,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(
+    const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5493,40 +5622,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }
 
 template<typename T>
-static void rope_cuda(
-
+static void rope_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
 template<typename T>
-static void rope_neox_cuda(
-
+static void rope_neox_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }
 
-static void rope_glm_f32_cuda(
-
+static void rope_glm_f32_cuda(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, int n_ctx, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5647,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5660,9 +5808,24 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
@@ -5698,6 +5861,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
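Note: together with the declaration this release adds to ggml-cuda.h, these changes let initialization degrade gracefully when cudaGetDeviceCount() fails instead of asserting. A hedged usage sketch, assuming the public header exposes ggml_cublas_loaded() as shown in the diff:

// Sketch: probe CUDA availability before relying on GPU offload.
#include <cstdio>
#include "ggml-cuda.h"

int main() {
    ggml_init_cublas();                 // no longer aborts when no CUDA device is present
    if (!ggml_cublas_loaded()) {
        fprintf(stderr, "no usable CUDA device, falling back to CPU\n");
        // proceed with a CPU-only configuration (e.g. zero offloaded layers)
    }
    return 0;
}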
@@ -5909,7 +6073,10 @@ inline void ggml_cuda_op_add(
         add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
     } else {
+        fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }
 
@@ -6347,7 +6514,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
                 &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i,
+                        src1_ddf_i, ne10,
                 &beta,  dst_dd_i,   ldc));
 
     if (src0_as != 0) {
@@ -6373,17 +6540,20 @@ inline void ggml_cuda_op_rope(
     const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    //const int n_past
-    const int n_dims
-    const int mode
-    const int n_ctx
-
-
-    float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
-
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
@@ -6395,24 +6565,39 @@ inline void ggml_cuda_op_rope(
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
     // compute
     if (is_glm) {
         GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
            GGML_ASSERT(false);
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
+            rope_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
        } else {
            GGML_ASSERT(false);
        }
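Note: the reads above imply a new op_params layout for the rope op — the float parameters shift by one slot to make room for n_orig_ctx, and four YaRN controls are appended. Purely as documentation (ggml defines no such enum; the names below are hypothetical), the slot assignments implied by the memcpy calls are:

// Hypothetical labels for the rope op_params slots read above (int32 slots, floats bit-copied).
enum rope_op_params_slot {
    ROPE_PARAM_N_PAST      = 0,
    ROPE_PARAM_N_DIMS      = 1,
    ROPE_PARAM_MODE        = 2,
    ROPE_PARAM_N_CTX       = 3,
    ROPE_PARAM_N_ORIG_CTX  = 4,   // new in this release
    ROPE_PARAM_FREQ_BASE   = 5,   // was slot 4 before this change
    ROPE_PARAM_FREQ_SCALE  = 6,   // was slot 5
    ROPE_PARAM_EXT_FACTOR  = 7,
    ROPE_PARAM_ATTN_FACTOR = 8,
    ROPE_PARAM_BETA_FAST   = 9,
    ROPE_PARAM_BETA_SLOW   = 10,
};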
@@ -6523,8 +6708,10 @@ inline void ggml_cuda_op_clamp(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6717,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
     int64_t row_low[GGML_CUDA_MAX_DEVICES];
     int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+    int used_devices = 0;
+
     for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
         row_low[id] = 0;
@@ -6744,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
+        used_devices++;
+
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6782,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split &&
+    if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split &&
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -6903,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
@@ -6927,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
@@ -6972,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7048,9 +7247,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-
+__global__ void k_compute_batched_ptrs(
+        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3) {
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)       dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
+
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7372,45 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
         const int ne23 = ne12*ne13;
 
-
-
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
-                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
-                ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
-            }
-        }
-
-        // allocate device memory for pointers
-        void ** ptrs_as = nullptr;
-        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+        const void ** ptrs_src = nullptr;
+        void ** ptrs_dst = nullptr;
 
-
-
-        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+        size_t ptrs_src_s = 0;
+        size_t ptrs_dst_s = 0;
 
-
-
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
-
+        dim3 block_dims(ne13, ne12);
+        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+                src0_as_f16, src1_as_f16, dst_f16,
+                ptrs_src, ptrs_dst,
+                ne12, ne13,
+                ne23,
+                nb02, nb03,
+                nb12, nb13,
+                dst->nb[2], dst->nb[3],
+                r2, r3);
+        CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void **) (
-                            (const void **) (
-                &beta_f16,  (      void **) (
+                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
                 ne23,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
-
-
+        if (ptrs_src_s != 0) {
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        }
+        if (ptrs_dst_s != 0) {
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+        }
     }
 #endif
 
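Note: this rewrite replaces a host-side loop plus a raw cudaMalloc with the k_compute_batched_ptrs kernel — the per-batch pointer tables for cublasGemmBatchedEx are now built on the GPU, in pool-allocated buffers, on the same stream as the GEMM, so no synchronous allocation or host-to-device copy sits in the matmul path. A stripped-down sketch of that pattern (illustrative names only, not the ggml functions, and without the broadcast indexing ggml performs):

// fill_ptrs builds one device pointer per batch entry, entirely on the GPU.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill_ptrs(char * base, void ** ptrs, int n, size_t stride) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        ptrs[i] = base + (size_t) i*stride;  // pointer to the i-th matrix in the batch
    }
}

int main() {
    const int n = 16;
    const size_t stride = 1024;
    char * data = nullptr;
    void ** ptrs = nullptr;
    cudaMalloc(&data, n*stride);
    cudaMalloc(&ptrs, n*sizeof(void *));
    fill_ptrs<<<1, n>>>(data, ptrs, n, stride);  // same stream as the subsequent batched GEMM would use
    cudaDeviceSynchronize();
    printf("fill_ptrs: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(ptrs);
    cudaFree(data);
    return 0;
}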
@@ -7202,17 +7422,26 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    bool all_on_device =
-
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
 
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id]
-            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
             min_compute_capability = g_compute_capabilities[id];
         }
     }
 
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7450,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
@@ -7247,7 +7475,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-
+        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+        // when tensor cores are available, use them for large batch size
+        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+            use_mul_mat_q = false;
+        }
+
+        if (use_mul_mat_q) {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
        } else {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
@@ -7601,10 +7837,6 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-    g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing
@@ -7624,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|