llama_cpp 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +353 -119
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +60 -38
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1004 -3572
- data/ext/llama_cpp/src/ggml.h +30 -4
- data/ext/llama_cpp/src/llama.cpp +1945 -2648
- data/ext/llama_cpp/src/llama.h +37 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -87,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+// probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
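The new compile-time switch above is consumed by the matrix-multiplication dispatch later in this file (see the `@@ -7247,7 +7475,15 @@` hunk below). A condensed, stand-alone sketch of that policy follows; the numeric values of MIN_CC_DP4A and CC_VOLTA are assumptions for illustration, not taken from this diff:

    #include <cstdint>

    // Sketch of the dispatch rule added in this release: quantized MMQ kernels for
    // small batches, cuBLAS with F16 tensor cores once the batch exceeds MMQ_MAX_BATCH_SIZE.
    static bool prefer_mmq(bool use_tensor_cores, int compute_capability, int64_t batch_size) {
        const int     min_cc_dp4a        = 610; // assumed value of MIN_CC_DP4A
        const int     cc_volta           = 700; // assumed value of CC_VOLTA
        const int64_t mmq_max_batch_size = 32;  // matches MMQ_MAX_BATCH_SIZE above

        bool use_mul_mat_q = compute_capability >= min_cc_dp4a; // src0 assumed quantized
        if (use_tensor_cores && compute_capability >= cc_volta && batch_size > mmq_max_batch_size) {
            use_mul_mat_q = false; // large batch: let cuBLAS use the tensor cores
        }
        return use_mul_mat_q;
    }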
@@ -470,7 +488,6 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -496,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
 dst[i] = __hadd(x[i], __float2half(y[i]));
 }
 
+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+if (i >= k) {
+return;
+}
+dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
 const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -956,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -1060,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -1164,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 const int num_blocks_per_row = ncols / QK_K;
 const int ib0 = row*num_blocks_per_row;
@@ -1418,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
 static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 if (row > nrows) return;
 
 const int num_blocks_per_row = ncols / QK_K;
@@ -3554,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_0_AMPERE 4
+#define MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
+#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3615,9 +3647,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_1_AMPERE 4
+#define MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
+#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3678,9 +3716,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_0_AMPERE 4
+#define MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
+#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3739,9 +3783,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_1_AMPERE 4
+#define MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
+#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3800,9 +3850,15 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q8_0_AMPERE 4
+#define MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
+#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3861,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3922,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -3985,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4048,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4109,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -4168,7 +4254,7 @@ template <bool need_check> static __global__ void
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
 if (row >= nrows) {
 return;
@@ -4208,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
 // qk = quantized weights per x block
 // qr = number of quantized weights per data value in x block
-const int row = blockIdx.
+const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
 if (row >= nrows) {
 return;
@@ -4407,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+const float y = (i0 / 2 - low) / max(0.001f, high - low);
+return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+float * cos_theta, float * sin_theta
+) {
+// Get n-d rotational scaling corrected for extrapolation
+float theta_interp = freq_scale * theta_extrap;
+float theta = theta_interp;
+if (ext_factor != 0.0f) {
+float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
 
+// Get n-d magnitude scaling corrected for interpolation
+mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+}
+*cos_theta = cosf(theta) * mscale;
+*sin_theta = sinf(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
-static __global__ void rope(
-
+static __global__ void rope(
+const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
 const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
 if (col >= ncols) {
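In the notation of the new rope_yarn device function above (s = freq_scale, e = ext_factor, m = mscale, and corr_dims.v = [low, high, ...]), the computation amounts to:

    \theta_{\mathrm{interp}} = s\,\theta_{\mathrm{extrap}}, \qquad
    r = e \left( 1 - \mathrm{clamp}\!\left( \frac{i_0/2 - \mathrm{low}}{\max(10^{-3},\, \mathrm{high} - \mathrm{low})},\, 0,\, 1 \right) \right)

    \theta = (1 - r)\,\theta_{\mathrm{interp}} + r\,\theta_{\mathrm{extrap}}, \qquad
    m \leftarrow m \left( 1 + 0.1 \ln \tfrac{1}{s} \right) \ \text{only when } e \neq 0

    \cos\theta^{*} = m \cos\theta, \qquad \sin\theta^{*} = m \sin\theta

When e = 0 this reduces to plain linear position interpolation, \theta = s\,\theta_{\mathrm{extrap}}, with the magnitude scale left unchanged.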
@@ -4423,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 const int i2 = row/p_delta_rows;
 
 const int p = has_pos ? pos[i2] : 0;
-const float
-
-
-
+const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+float cos_theta, sin_theta;
+rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
 const float x0 = x[i + 0];
 const float x1 = x[i + 1];
@@ -4436,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 }
 
 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-
+static __global__ void rope_neox(
+const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
 const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
 if (col >= ncols) {
@@ -4448,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
 const int i = row*ncols + col/2;
 const int i2 = row/p_delta_rows;
 
+// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+const float cur_rot = -float(col)/ncols;
+
 const int p = has_pos ? pos[i2] : 0;
-const float
-
-
-
+const float theta_base = p*powf(freq_base, cur_rot);
+
+float cos_theta, sin_theta;
+rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
 const float x0 = x[i + 0];
 const float x1 = x[i + ncols/2];
@@ -4461,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
 dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(
-
+static __global__ void rope_glm_f32(
+const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+int n_ctx
+) {
 const int col = blockDim.x*blockIdx.x + threadIdx.x;
 const int half_n_dims = ncols/4;
 
@@ -4474,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 const int i = row*ncols + col;
 const int i2 = row/p_delta_rows;
 
-const float col_theta_scale = powf(
+const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
 // FIXME: this is likely wrong
 const int p = pos != nullptr ? pos[i2] : 0;
 
@@ -4616,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
 add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
 const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
 mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -4739,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-
+// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
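The comment in this hunk refers to CUDA's per-dimension grid limits: grid.x may hold up to 2^31 - 1 blocks, while grid.y and grid.z are capped at 65535, so a very tall weight matrix could overflow a y-indexed grid. A small stand-alone probe of those limits (illustrative only, not part of the gem):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Print the grid-size limits that motivate indexing row blocks on grid.x.
    int main() {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            fprintf(stderr, "no CUDA device found\n");
            return 1;
        }
        // Typically x = 2147483647 and y = z = 65535 on compute capability >= 3.0.
        printf("maxGridSize: x=%d y=%d z=%d\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        return 0;
    }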
@@ -4748,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4757,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4766,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4775,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4785,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
 const int block_num_y = (nrows + ny - 1) / ny;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4794,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4803,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4818,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 GGML_ASSERT(ncols % QK_K == 0);
 const int ny = 2 / K_QUANTS_PER_ITERATION;
 const int block_num_y = (nrows + ny - 1) / ny;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(32, ny, 1);
 dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4826,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK4_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4835,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK4_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4844,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK5_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4853,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK5_1 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4862,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK8_0 == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4871,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4880,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4889,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4898,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4907,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % QK_K == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4926,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-const dim3 block_nums(
+const dim3 block_nums(block_num_y, 1, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
 dequantize_mul_mat_vec<1, 1, convert_f16>
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5493,40 +5622,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }
 
 template<typename T>
-static void rope_cuda(
-
+static void rope_cuda(
+const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
 GGML_ASSERT(ncols % 2 == 0);
 const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 const dim3 block_nums(nrows, num_blocks_x, 1);
 if (pos == nullptr) {
-rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+);
 } else {
-rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+);
 }
 }
 
 template<typename T>
-static void rope_neox_cuda(
-
+static void rope_neox_cuda(
+const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
 GGML_ASSERT(ncols % 2 == 0);
 const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 const dim3 block_nums(nrows, num_blocks_x, 1);
 if (pos == nullptr) {
-rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+);
 } else {
-rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+);
 }
 }
 
-static void rope_glm_f32_cuda(
-
+static void rope_glm_f32_cuda(
+const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+float freq_base, int n_ctx, cudaStream_t stream
+) {
 GGML_ASSERT(ncols % 4 == 0);
 const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
 const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
 const dim3 block_nums(num_blocks_x, nrows, 1);
-rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5647,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
 static bool initialized = false;
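ggml_cublas_loaded() gives callers a runtime signal that CUDA initialization actually found usable devices; the ggml-cuda.h change (+5 lines) in this release presumably exports it, and later hunks below use the same flag to bail out of ggml_cuda_can_mul_mat and ggml_cuda_compute_forward. A minimal usage sketch under that assumption:

    #include <stdbool.h>
    #include <stdio.h>

    // Assumed declarations from ggml-cuda.h (see its +5-line change in this release).
    extern void ggml_init_cublas(void);
    extern bool ggml_cublas_loaded(void);

    static bool gpu_available(void) {
        ggml_init_cublas();            // safe to call repeatedly; it guards on an internal flag
        if (!ggml_cublas_loaded()) {
            fprintf(stderr, "cuBLAS not initialized, falling back to CPU\n");
            return false;
        }
        return true;
    }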
@@ -5660,9 +5808,24 @@ void ggml_init_cublas() {
 CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-
+if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+initialized = true;
+g_cublas_loaded = false;
+return;
+}
+
 GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
 fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
 for (int id = 0; id < g_device_count; ++id) {
 cudaDeviceProp prop;
@@ -5698,6 +5861,7 @@ void ggml_init_cublas() {
 // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
 initialized = true;
+g_cublas_loaded = true;
 }
 }
 
@@ -5909,7 +6073,10 @@ inline void ggml_cuda_op_add(
 add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
 } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
 add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
 } else {
+fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
 GGML_ASSERT(false);
 }
 
@@ -6347,7 +6514,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 row_diff, src1_ncols, ne10,
 &alpha, src0_ddf_i, ne00,
-src1_ddf_i,
+src1_ddf_i, ne10,
 &beta, dst_dd_i, ldc));
 
 if (src0_as != 0) {
@@ -6373,17 +6540,20 @@ inline void ggml_cuda_op_rope(
 const int64_t ne2 = dst->ne[2];
 const int64_t nrows = ggml_nrows(src0);
 
-//const int n_past
-const int n_dims
-const int mode
-const int n_ctx
-
-
-float freq_base, freq_scale;
-memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
-memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+//const int n_past = ((int32_t *) dst->op_params)[0];
+const int n_dims = ((int32_t *) dst->op_params)[1];
+const int mode = ((int32_t *) dst->op_params)[2];
+const int n_ctx = ((int32_t *) dst->op_params)[3];
+const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
-
+// RoPE alteration for extended context
+float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
 const int32_t * pos = nullptr;
 if ((mode & 1) == 0) {
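For readability, the ggml_tensor::op_params layout this code now expects for RoPE, reconstructed from the indices used above (a reference comment, not a struct that exists in the source):

    // op_params slots read by ggml_cuda_op_rope after this change:
    //   int32_t [0] n_past (unused)   [1] n_dims       [2] mode
    //           [3] n_ctx             [4] n_orig_ctx
    //   float   [5] freq_base         [6] freq_scale   [7] ext_factor
    //           [8] attn_factor       [9] beta_fast    [10] beta_slow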
@@ -6395,24 +6565,39 @@ inline void ggml_cuda_op_rope(
 const bool is_neox = mode & 2;
 const bool is_glm = mode & 4;
 
+rope_corr_dims corr_dims;
+ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
 // compute
 if (is_glm) {
 GGML_ASSERT(false);
-rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
+rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
 } else if (is_neox) {
 GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
 if (src0->type == GGML_TYPE_F32) {
-rope_neox_cuda(
+rope_neox_cuda(
+(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+attn_factor, corr_dims, main_stream
+);
 } else if (src0->type == GGML_TYPE_F16) {
-rope_neox_cuda(
+rope_neox_cuda(
+(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+attn_factor, corr_dims, main_stream
+);
 } else {
 GGML_ASSERT(false);
 }
 } else {
 if (src0->type == GGML_TYPE_F32) {
-rope_cuda(
+rope_cuda(
+(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+attn_factor, corr_dims, main_stream
+);
 } else if (src0->type == GGML_TYPE_F16) {
-rope_cuda(
+rope_cuda(
+(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+attn_factor, corr_dims, main_stream
+);
 } else {
 GGML_ASSERT(false);
 }
@@ -6523,8 +6708,10 @@ inline void ggml_cuda_op_clamp(
 GGML_ASSERT(src0->type == GGML_TYPE_F32);
 GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
+float min;
+float max;
+memcpy(&min, dst->op_params, sizeof(float));
+memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
 clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
 CUDA_CHECK(cudaGetLastError());
@@ -6717,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
 int64_t row_low[GGML_CUDA_MAX_DEVICES];
 int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
+int used_devices = 0;
+
 for (int64_t id = 0; id < g_device_count; ++id) {
 // by default, use all rows
 row_low[id] = 0;
@@ -6744,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
 continue;
 }
 
+used_devices++;
+
 const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
@@ -6782,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(
 
 // if multiple devices are used they need to wait for the main device
 // here an event is recorded that signals that the main device has finished calculating the input data
-if (split &&
+if (split && used_devices > 1) {
 CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
 }
 
-const int64_t src1_col_stride = split &&
+const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
 for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
 const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
 const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -6903,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
 }
 
 for (int64_t id = 0; id < g_device_count; ++id) {
+if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+continue;
+}
 CUDA_CHECK(ggml_cuda_set_device(id));
 
 // free buffers again when done
@@ -6927,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(
 
 CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 for (int64_t id = 0; id < g_device_count; ++id) {
+if (row_low[id] == row_high[id]) {
+continue;
+}
 for (int64_t is = 0; is < is_max; ++is) {
 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
 }
@@ -6972,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+if (!g_cublas_loaded) return false;
+
 const int64_t ne10 = src1->ne[0];
 
 const int64_t ne0 = dst->ne[0];
@@ -7048,9 +7247,34 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-
+__global__ void k_compute_batched_ptrs(
+const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+const void ** ptrs_src, void ** ptrs_dst,
+int ne12, int ne13,
+int ne23,
+int nb02, int nb03,
+int nb12, int nb13,
+int nb2, int nb3,
+int r2, int r3) {
+int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+if (i13 >= ne13 || i12 >= ne12) {
+return;
+}
+
+int i03 = i13 / r3;
+int i02 = i12 / r2;
+
+ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 GGML_ASSERT(!ggml_is_transposed(src0));
 GGML_ASSERT(!ggml_is_transposed(src1));
+
 GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7148,49 +7372,45 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 } else {
 // use cublasGemmBatchedEx
-// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
 const int ne23 = ne12*ne13;
 
-
-
-
-for (int i13 = 0; i13 < ne13; ++i13) {
-for (int i12 = 0; i12 < ne12; ++i12) {
-int i03 = i13 / r3;
-int i02 = i12 / r2;
-
-ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
-ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
-ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
-}
-}
-
-// allocate device memory for pointers
-void ** ptrs_as = nullptr;
-CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+const void ** ptrs_src = nullptr;
+void ** ptrs_dst = nullptr;
 
-
-
-//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+size_t ptrs_src_s = 0;
+size_t ptrs_dst_s = 0;
 
-
-
+ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
-
+dim3 block_dims(ne13, ne12);
+k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+src0_as_f16, src1_as_f16, dst_f16,
+ptrs_src, ptrs_dst,
+ne12, ne13,
+ne23,
+nb02, nb03,
+nb12, nb13,
+dst->nb[2], dst->nb[3],
+r2, r3);
+CUDA_CHECK(cudaGetLastError());
 
 CUBLAS_CHECK(
 cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 ne01, ne11, ne10,
-&alpha_f16, (const void **) (
-(const void **) (
-&beta_f16, ( void **) (
+&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
 ne23,
 CUBLAS_COMPUTE_16F,
 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-
-
-
+if (ptrs_src_s != 0) {
+ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+}
+if (ptrs_dst_s != 0) {
+ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+}
 }
 #endif
 
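The rewrite above replaces a host-side pointer-filling loop plus cudaMalloc with a small kernel (k_compute_batched_ptrs) that writes the pointer tables directly into pool-allocated device memory on the same stream as the GEMM. A kernel is needed because cublasGemmBatchedEx reads its Aarray/Barray/Carray pointer arrays from device memory. A minimal, self-contained sketch of that pattern (names and element strides here are hypothetical, not from the diff):

    #include <cuda_fp16.h>

    // Fill per-batch matrix pointers on the device so cuBLAS can read them directly.
    __global__ void fill_batch_ptrs(const half * A, const half * B, half * C,
                                    const void ** pA, const void ** pB, void ** pC,
                                    int batch, size_t strideA, size_t strideB, size_t strideC) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= batch) {
            return;
        }
        pA[i] = A + i * strideA;  // strides in elements, unlike the byte offsets used above
        pB[i] = B + i * strideB;
        pC[i] = C + i * strideC;
    }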
@@ -7202,17 +7422,26 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-bool all_on_device =
-
+const bool all_on_device =
+(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+(src1->backend == GGML_BACKEND_GPU) &&
+( dst->backend == GGML_BACKEND_GPU);
+
+const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
 
 int64_t min_compute_capability = INT_MAX;
 for (int64_t id = 0; id < g_device_count; ++id) {
-if (min_compute_capability > g_compute_capabilities[id]
-&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
 min_compute_capability = g_compute_capabilities[id];
 }
 }
 
+#ifdef CUDA_USE_TENSOR_CORES
+const bool use_tensor_cores = true;
+#else
+const bool use_tensor_cores = false;
+#endif
+
 // debug helpers
 //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
 //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7221,20 +7450,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
 // KQ single-batch
 ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
 // KQV single-batch
 ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
+} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
 // KQ + KQV multi-batch
 ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F32) {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
 } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
 if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
 #ifdef GGML_CUDA_FORCE_DMMV
 const bool use_mul_mat_vec_q = false;
 #else
@@ -7247,7 +7475,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
 }
 } else {
-
+bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+// when tensor cores are available, use them for large batch size
+// ref: https://github.com/ggerganov/llama.cpp/pull/3776
+if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+use_mul_mat_q = false;
+}
+
+if (use_mul_mat_q) {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
 } else {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
@@ -7601,10 +7837,6 @@ void ggml_cuda_set_main_device(const int main_device) {
 }
 }
 
-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
 // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
 // it still won't always work as expected, but it's better than nothing
@@ -7624,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+if (!g_cublas_loaded) return false;
+
 ggml_cuda_func_t func;
 const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
 || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|