llama_cpp 0.8.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
@@ -29,6 +29,8 @@
|
|
29
29
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
30
30
|
#define cublasCreate hipblasCreate
|
31
31
|
#define cublasGemmEx hipblasGemmEx
|
32
|
+
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
33
|
+
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
32
34
|
#define cublasHandle_t hipblasHandle_t
|
33
35
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
34
36
|
#define cublasSetStream hipblasSetStream
|
@@ -85,6 +87,24 @@
|
|
85
87
|
#define CC_OFFSET_AMD 1000000
|
86
88
|
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
87
89
|
|
90
|
+
// define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
|
91
|
+
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
92
|
+
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
93
|
+
// - 7B quantum model: +100-200 MB
|
94
|
+
// - 13B quantum model: +200-400 MB
|
95
|
+
//
|
96
|
+
//#define GGML_CUDA_FORCE_MMQ
|
97
|
+
|
98
|
+
// TODO: improve this to be correct for more hardware
|
99
|
+
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
100
|
+
// probably other such cases, and not sure what happens on AMD hardware
|
101
|
+
#if !defined(GGML_CUDA_FORCE_MMQ)
|
102
|
+
#define CUDA_USE_TENSOR_CORES
|
103
|
+
#endif
|
104
|
+
|
105
|
+
// max batch size to use MMQ kernels when tensor cores are available
|
106
|
+
#define MMQ_MAX_BATCH_SIZE 32
|
107
|
+
|
88
108
|
#if defined(GGML_USE_HIPBLAS)
|
89
109
|
#define __CUDA_ARCH__ 1300
|
90
110
|
|
@@ -468,7 +488,6 @@ static int g_device_count = -1;
|
|
468
488
|
static int g_main_device = 0;
|
469
489
|
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
470
490
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
471
|
-
static bool g_mul_mat_q = true;
|
472
491
|
|
473
492
|
static void * g_scratch_buffer = nullptr;
|
474
493
|
static size_t g_scratch_size = 0; // disabled by default
|
@@ -494,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
|
|
494
513
|
dst[i] = __hadd(x[i], __float2half(y[i]));
|
495
514
|
}
|
496
515
|
|
516
|
+
// Element-wise addition of an F16 vector and an F32 vector with an F32 result:
//   dst[i] = float(x[i]) + y[i]   for every i in [0, k)
//
// x   - F16 input, read at index i
// y   - F32 input, read at index i
// dst - F32 output, written at index i
// k   - number of elements to process
//
// Each thread handles the single element addressed by its flat x-dimension
// index; threads whose index is >= k do nothing.
static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
    const int idx = blockDim.x*blockIdx.x + threadIdx.x;

    if (idx < k) {
        // widen the half operand first so the sum is carried out in F32
        dst[idx] = __half2float(x[idx]) + y[idx];
    }
}
|
524
|
+
|
497
525
|
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
498
526
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
499
527
|
|
@@ -3552,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
|
|
3552
3580
|
#define MMQ_X_Q4_0_RDNA1 64
|
3553
3581
|
#define MMQ_Y_Q4_0_RDNA1 64
|
3554
3582
|
#define NWARPS_Q4_0_RDNA1 8
|
3583
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3584
|
+
#define MMQ_X_Q4_0_AMPERE 4
|
3585
|
+
#define MMQ_Y_Q4_0_AMPERE 32
|
3586
|
+
#define NWARPS_Q4_0_AMPERE 4
|
3587
|
+
#else
|
3555
3588
|
#define MMQ_X_Q4_0_AMPERE 64
|
3556
3589
|
#define MMQ_Y_Q4_0_AMPERE 128
|
3557
3590
|
#define NWARPS_Q4_0_AMPERE 4
|
3591
|
+
#endif
|
3558
3592
|
#define MMQ_X_Q4_0_PASCAL 64
|
3559
3593
|
#define MMQ_Y_Q4_0_PASCAL 64
|
3560
3594
|
#define NWARPS_Q4_0_PASCAL 8
|
@@ -3613,9 +3647,15 @@ template <bool need_check> static __global__ void
|
|
3613
3647
|
#define MMQ_X_Q4_1_RDNA1 64
|
3614
3648
|
#define MMQ_Y_Q4_1_RDNA1 64
|
3615
3649
|
#define NWARPS_Q4_1_RDNA1 8
|
3650
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3651
|
+
#define MMQ_X_Q4_1_AMPERE 4
|
3652
|
+
#define MMQ_Y_Q4_1_AMPERE 32
|
3653
|
+
#define NWARPS_Q4_1_AMPERE 4
|
3654
|
+
#else
|
3616
3655
|
#define MMQ_X_Q4_1_AMPERE 64
|
3617
3656
|
#define MMQ_Y_Q4_1_AMPERE 128
|
3618
3657
|
#define NWARPS_Q4_1_AMPERE 4
|
3658
|
+
#endif
|
3619
3659
|
#define MMQ_X_Q4_1_PASCAL 64
|
3620
3660
|
#define MMQ_Y_Q4_1_PASCAL 64
|
3621
3661
|
#define NWARPS_Q4_1_PASCAL 8
|
@@ -3676,9 +3716,15 @@ template <bool need_check> static __global__ void
|
|
3676
3716
|
#define MMQ_X_Q5_0_RDNA1 64
|
3677
3717
|
#define MMQ_Y_Q5_0_RDNA1 64
|
3678
3718
|
#define NWARPS_Q5_0_RDNA1 8
|
3719
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3720
|
+
#define MMQ_X_Q5_0_AMPERE 4
|
3721
|
+
#define MMQ_Y_Q5_0_AMPERE 32
|
3722
|
+
#define NWARPS_Q5_0_AMPERE 4
|
3723
|
+
#else
|
3679
3724
|
#define MMQ_X_Q5_0_AMPERE 128
|
3680
3725
|
#define MMQ_Y_Q5_0_AMPERE 64
|
3681
3726
|
#define NWARPS_Q5_0_AMPERE 4
|
3727
|
+
#endif
|
3682
3728
|
#define MMQ_X_Q5_0_PASCAL 64
|
3683
3729
|
#define MMQ_Y_Q5_0_PASCAL 64
|
3684
3730
|
#define NWARPS_Q5_0_PASCAL 8
|
@@ -3737,9 +3783,15 @@ template <bool need_check> static __global__ void
|
|
3737
3783
|
#define MMQ_X_Q5_1_RDNA1 64
|
3738
3784
|
#define MMQ_Y_Q5_1_RDNA1 64
|
3739
3785
|
#define NWARPS_Q5_1_RDNA1 8
|
3786
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3787
|
+
#define MMQ_X_Q5_1_AMPERE 4
|
3788
|
+
#define MMQ_Y_Q5_1_AMPERE 32
|
3789
|
+
#define NWARPS_Q5_1_AMPERE 4
|
3790
|
+
#else
|
3740
3791
|
#define MMQ_X_Q5_1_AMPERE 128
|
3741
3792
|
#define MMQ_Y_Q5_1_AMPERE 64
|
3742
3793
|
#define NWARPS_Q5_1_AMPERE 4
|
3794
|
+
#endif
|
3743
3795
|
#define MMQ_X_Q5_1_PASCAL 64
|
3744
3796
|
#define MMQ_Y_Q5_1_PASCAL 64
|
3745
3797
|
#define NWARPS_Q5_1_PASCAL 8
|
@@ -3798,9 +3850,15 @@ mul_mat_q5_1(
|
|
3798
3850
|
#define MMQ_X_Q8_0_RDNA1 64
|
3799
3851
|
#define MMQ_Y_Q8_0_RDNA1 64
|
3800
3852
|
#define NWARPS_Q8_0_RDNA1 8
|
3853
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3854
|
+
#define MMQ_X_Q8_0_AMPERE 4
|
3855
|
+
#define MMQ_Y_Q8_0_AMPERE 32
|
3856
|
+
#define NWARPS_Q8_0_AMPERE 4
|
3857
|
+
#else
|
3801
3858
|
#define MMQ_X_Q8_0_AMPERE 128
|
3802
3859
|
#define MMQ_Y_Q8_0_AMPERE 64
|
3803
3860
|
#define NWARPS_Q8_0_AMPERE 4
|
3861
|
+
#endif
|
3804
3862
|
#define MMQ_X_Q8_0_PASCAL 64
|
3805
3863
|
#define MMQ_Y_Q8_0_PASCAL 64
|
3806
3864
|
#define NWARPS_Q8_0_PASCAL 8
|
@@ -3859,9 +3917,15 @@ template <bool need_check> static __global__ void
|
|
3859
3917
|
#define MMQ_X_Q2_K_RDNA1 128
|
3860
3918
|
#define MMQ_Y_Q2_K_RDNA1 32
|
3861
3919
|
#define NWARPS_Q2_K_RDNA1 8
|
3920
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3921
|
+
#define MMQ_X_Q2_K_AMPERE 4
|
3922
|
+
#define MMQ_Y_Q2_K_AMPERE 32
|
3923
|
+
#define NWARPS_Q2_K_AMPERE 4
|
3924
|
+
#else
|
3862
3925
|
#define MMQ_X_Q2_K_AMPERE 64
|
3863
3926
|
#define MMQ_Y_Q2_K_AMPERE 128
|
3864
3927
|
#define NWARPS_Q2_K_AMPERE 4
|
3928
|
+
#endif
|
3865
3929
|
#define MMQ_X_Q2_K_PASCAL 64
|
3866
3930
|
#define MMQ_Y_Q2_K_PASCAL 64
|
3867
3931
|
#define NWARPS_Q2_K_PASCAL 8
|
@@ -3920,9 +3984,15 @@ mul_mat_q2_K(
|
|
3920
3984
|
#define MMQ_X_Q3_K_RDNA1 32
|
3921
3985
|
#define MMQ_Y_Q3_K_RDNA1 128
|
3922
3986
|
#define NWARPS_Q3_K_RDNA1 8
|
3987
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
3988
|
+
#define MMQ_X_Q3_K_AMPERE 4
|
3989
|
+
#define MMQ_Y_Q3_K_AMPERE 32
|
3990
|
+
#define NWARPS_Q3_K_AMPERE 4
|
3991
|
+
#else
|
3923
3992
|
#define MMQ_X_Q3_K_AMPERE 128
|
3924
3993
|
#define MMQ_Y_Q3_K_AMPERE 128
|
3925
3994
|
#define NWARPS_Q3_K_AMPERE 4
|
3995
|
+
#endif
|
3926
3996
|
#define MMQ_X_Q3_K_PASCAL 64
|
3927
3997
|
#define MMQ_Y_Q3_K_PASCAL 64
|
3928
3998
|
#define NWARPS_Q3_K_PASCAL 8
|
@@ -3983,9 +4053,15 @@ template <bool need_check> static __global__ void
|
|
3983
4053
|
#define MMQ_X_Q4_K_RDNA1 32
|
3984
4054
|
#define MMQ_Y_Q4_K_RDNA1 64
|
3985
4055
|
#define NWARPS_Q4_K_RDNA1 8
|
4056
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
4057
|
+
#define MMQ_X_Q4_K_AMPERE 4
|
4058
|
+
#define MMQ_Y_Q4_K_AMPERE 32
|
4059
|
+
#define NWARPS_Q4_K_AMPERE 4
|
4060
|
+
#else
|
3986
4061
|
#define MMQ_X_Q4_K_AMPERE 64
|
3987
4062
|
#define MMQ_Y_Q4_K_AMPERE 128
|
3988
4063
|
#define NWARPS_Q4_K_AMPERE 4
|
4064
|
+
#endif
|
3989
4065
|
#define MMQ_X_Q4_K_PASCAL 64
|
3990
4066
|
#define MMQ_Y_Q4_K_PASCAL 64
|
3991
4067
|
#define NWARPS_Q4_K_PASCAL 8
|
@@ -4046,9 +4122,15 @@ template <bool need_check> static __global__ void
|
|
4046
4122
|
#define MMQ_X_Q5_K_RDNA1 32
|
4047
4123
|
#define MMQ_Y_Q5_K_RDNA1 64
|
4048
4124
|
#define NWARPS_Q5_K_RDNA1 8
|
4125
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
4126
|
+
#define MMQ_X_Q5_K_AMPERE 4
|
4127
|
+
#define MMQ_Y_Q5_K_AMPERE 32
|
4128
|
+
#define NWARPS_Q5_K_AMPERE 4
|
4129
|
+
#else
|
4049
4130
|
#define MMQ_X_Q5_K_AMPERE 64
|
4050
4131
|
#define MMQ_Y_Q5_K_AMPERE 128
|
4051
4132
|
#define NWARPS_Q5_K_AMPERE 4
|
4133
|
+
#endif
|
4052
4134
|
#define MMQ_X_Q5_K_PASCAL 64
|
4053
4135
|
#define MMQ_Y_Q5_K_PASCAL 64
|
4054
4136
|
#define NWARPS_Q5_K_PASCAL 8
|
@@ -4107,9 +4189,15 @@ mul_mat_q5_K(
|
|
4107
4189
|
#define MMQ_X_Q6_K_RDNA1 32
|
4108
4190
|
#define MMQ_Y_Q6_K_RDNA1 64
|
4109
4191
|
#define NWARPS_Q6_K_RDNA1 8
|
4192
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
4193
|
+
#define MMQ_X_Q6_K_AMPERE 4
|
4194
|
+
#define MMQ_Y_Q6_K_AMPERE 32
|
4195
|
+
#define NWARPS_Q6_K_AMPERE 4
|
4196
|
+
#else
|
4110
4197
|
#define MMQ_X_Q6_K_AMPERE 64
|
4111
4198
|
#define MMQ_Y_Q6_K_AMPERE 64
|
4112
4199
|
#define NWARPS_Q6_K_AMPERE 4
|
4200
|
+
#endif
|
4113
4201
|
#define MMQ_X_Q6_K_PASCAL 64
|
4114
4202
|
#define MMQ_Y_Q6_K_PASCAL 64
|
4115
4203
|
#define NWARPS_Q6_K_PASCAL 8
|
@@ -4326,13 +4414,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
4326
4414
|
|
4327
4415
|
const half * x = (const half *) vx;
|
4328
4416
|
|
4329
|
-
const int row_x
|
4330
|
-
const int channel
|
4417
|
+
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
4418
|
+
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
4331
4419
|
const int channel_x = channel / channel_x_divisor;
|
4332
4420
|
|
4333
|
-
const int nrows_y
|
4421
|
+
const int nrows_y = ncols_x;
|
4334
4422
|
const int nrows_dst = nrows_x;
|
4335
|
-
const int row_dst
|
4423
|
+
const int row_dst = row_x;
|
4336
4424
|
|
4337
4425
|
const int idst = channel*nrows_dst + row_dst;
|
4338
4426
|
|
@@ -4345,13 +4433,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|
4345
4433
|
break;
|
4346
4434
|
}
|
4347
4435
|
|
4348
|
-
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
4349
|
-
const float xi = __half2float(x[ix]);
|
4350
|
-
|
4351
4436
|
const int row_y = col_x;
|
4352
4437
|
|
4438
|
+
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
4353
4439
|
const int iy = channel*nrows_y + row_y;
|
4354
4440
|
|
4441
|
+
const float xi = __half2float(x[ix]);
|
4442
|
+
|
4355
4443
|
tmp += xi * y[iy];
|
4356
4444
|
}
|
4357
4445
|
|
@@ -4405,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
4405
4493
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
4406
4494
|
}
|
4407
4495
|
|
4408
|
-
|
4496
|
+
// Linear ramp used by rope_yarn to mix two rotation angles.
//
// Computes t = (i0/2 - low) / max(0.001, high - low), clamps t to [0, 1] and
// returns 1 - t. Consequently the result is 1 when i0/2 <= low, 0 when
// i0/2 >= high, and falls linearly in between.
//
// low, high - bounds of the ramp
// i0        - index being ramped; note that i0/2 is an integer division
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
    // the span is floored at 0.001 so that high == low cannot divide by zero
    const float span = max(0.001f, high - low);
    const float t    = (i0 / 2 - low) / span;

    const float t_clamped = min(1.0f, max(0.0f, t));
    return 1.0f - t_clamped;
}
|
4500
|
+
|
4501
|
+
// Four floats bundled in a struct so they can be handed to the rope kernels
// as a single by-value argument.
// rope_yarn reads v[0] and v[1] as the (low, high) bounds of its ramp.
// NOTE(review): v[2] and v[3] are not read by the code visible here - confirm
// their meaning against ggml_rope_yarn_corr_dims, which fills this array.
struct rope_corr_dims {
    float v[4];
};
|
4504
|
+
|
4505
|
+
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
//
// Produces the scaled cosine and sine of a rotation angle.
//
// theta_extrap - unscaled ("extrapolated") angle
// freq_scale   - factor applied to theta_extrap to get the "interpolated" angle
// corr_dims    - v[0] / v[1] are the low / high bounds handed to rope_yarn_ramp
// i0           - index handed to rope_yarn_ramp (narrowed to int there)
// ext_factor   - weight of the extrapolated angle in the mix; 0 disables the
//                mix and the magnitude correction entirely
// mscale       - magnitude multiplier applied to both outputs
// cos_theta    - out: cos(theta) * mscale
// sin_theta    - out: sin(theta) * mscale
static __device__ void rope_yarn(
    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
    float * cos_theta, float * sin_theta
) {
    const float theta_interp = freq_scale * theta_extrap;

    // with ext_factor == 0 the plain interpolated angle is used unchanged
    float theta = theta_interp;

    if (ext_factor != 0.0f) {
        // n-d rotational scaling corrected for extrapolation:
        // blend the two angles according to the ramp value for this index
        const float mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
        theta = theta_interp * (1 - mix) + theta_extrap * mix;

        // n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }

    *cos_theta = cosf(theta) * mscale;
    *sin_theta = sinf(theta) * mscale;
}
|
4409
4524
|
|
4525
|
+
// rope == RoPE == rotary positional embedding
|
4410
4526
|
template<typename T, bool has_pos>
|
4411
|
-
static __global__ void rope(
|
4412
|
-
|
4527
|
+
static __global__ void rope(
|
4528
|
+
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
4529
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
4530
|
+
) {
|
4413
4531
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4414
4532
|
|
4415
4533
|
if (col >= ncols) {
|
@@ -4421,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
|
|
4421
4539
|
const int i2 = row/p_delta_rows;
|
4422
4540
|
|
4423
4541
|
const int p = has_pos ? pos[i2] : 0;
|
4424
|
-
const float
|
4425
|
-
|
4426
|
-
|
4427
|
-
|
4542
|
+
const float theta_base = p*powf(freq_base, -float(col)/ncols);
|
4543
|
+
|
4544
|
+
float cos_theta, sin_theta;
|
4545
|
+
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
4428
4546
|
|
4429
4547
|
const float x0 = x[i + 0];
|
4430
4548
|
const float x1 = x[i + 1];
|
@@ -4434,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
|
|
4434
4552
|
}
|
4435
4553
|
|
4436
4554
|
template<typename T, bool has_pos>
|
4437
|
-
static __global__ void rope_neox(
|
4438
|
-
|
4555
|
+
static __global__ void rope_neox(
|
4556
|
+
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
4557
|
+
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
4558
|
+
) {
|
4439
4559
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4440
4560
|
|
4441
4561
|
if (col >= ncols) {
|
@@ -4446,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
|
|
4446
4566
|
const int i = row*ncols + col/2;
|
4447
4567
|
const int i2 = row/p_delta_rows;
|
4448
4568
|
|
4569
|
+
// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
|
4570
|
+
const float cur_rot = -float(col)/ncols;
|
4571
|
+
|
4449
4572
|
const int p = has_pos ? pos[i2] : 0;
|
4450
|
-
const float
|
4451
|
-
|
4452
|
-
|
4453
|
-
|
4573
|
+
const float theta_base = p*powf(freq_base, cur_rot);
|
4574
|
+
|
4575
|
+
float cos_theta, sin_theta;
|
4576
|
+
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
4454
4577
|
|
4455
4578
|
const float x0 = x[i + 0];
|
4456
4579
|
const float x1 = x[i + ncols/2];
|
@@ -4459,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
|
|
4459
4582
|
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
4460
4583
|
}
|
4461
4584
|
|
4462
|
-
static __global__ void rope_glm_f32(
|
4463
|
-
|
4585
|
+
static __global__ void rope_glm_f32(
|
4586
|
+
const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
4587
|
+
int n_ctx
|
4588
|
+
) {
|
4464
4589
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
4465
4590
|
const int half_n_dims = ncols/4;
|
4466
4591
|
|
@@ -4472,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
|
4472
4597
|
const int i = row*ncols + col;
|
4473
4598
|
const int i2 = row/p_delta_rows;
|
4474
4599
|
|
4475
|
-
const float col_theta_scale = powf(
|
4600
|
+
const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
|
4476
4601
|
// FIXME: this is likely wrong
|
4477
4602
|
const int p = pos != nullptr ? pos[i2] : 0;
|
4478
4603
|
|
@@ -4614,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
|
|
4614
4739
|
add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
4615
4740
|
}
|
4616
4741
|
|
4742
|
+
// Host-side launcher for the add_f16_f32_f32 kernel.
//
// x, y, dst - pointers forwarded to the kernel unchanged
// k         - number of elements to add
// stream    - CUDA stream the kernel is enqueued on
//
// The launch uses CUDA_ADD_BLOCK_SIZE threads per block and just enough
// blocks to cover k elements (k divided by the block size, rounded up).
static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
    const int grid_size = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;

    add_f16_f32_f32<<<grid_size, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
|
4746
|
+
|
4617
4747
|
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
4618
4748
|
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
|
4619
4749
|
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
@@ -5491,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
|
|
5491
5621
|
}
|
5492
5622
|
|
5493
5623
|
// Host-side launcher for the rope kernel.
//
// x, dst       - input / output buffers of element type T
// ncols, nrows - extent of the data; ncols must be even
// pos          - optional position array; nullptr selects the has_pos == false
//                kernel instantiation
// freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
//              - forwarded to the kernel unchanged
// stream       - CUDA stream the kernel is enqueued on
//
// Grid layout: x indexes rows (one block per row), y indexes column blocks.
// A block holds CUDA_ROPE_BLOCK_SIZE threads along y and the grid's y extent
// is ncols divided by 2*CUDA_ROPE_BLOCK_SIZE, rounded up.
template<typename T>
static void rope_cuda(
    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);

    const int num_blocks_y = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);

    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const dim3 block_nums(nrows, num_blocks_y, 1);

    // has_pos is a template argument, so the choice has to be made by branching
    if (pos != nullptr) {
        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    } else {
        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    }
}
|
5506
5642
|
|
5507
5643
|
// Host-side launcher for the rope_neox kernel.
//
// x, dst       - input / output buffers of element type T
// ncols, nrows - extent of the data; ncols must be even
// pos          - optional position array; nullptr selects the has_pos == false
//                kernel instantiation
// freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
//              - forwarded to the kernel unchanged
// stream       - CUDA stream the kernel is enqueued on
//
// Grid layout: x indexes rows (one block per row), y indexes column blocks.
// A block holds CUDA_ROPE_BLOCK_SIZE threads along y and the grid's y extent
// is ncols divided by 2*CUDA_ROPE_BLOCK_SIZE, rounded up.
template<typename T>
static void rope_neox_cuda(
    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 2 == 0);

    const int num_blocks_y = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);

    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const dim3 block_nums(nrows, num_blocks_y, 1);

    // has_pos is a template argument, so the choice has to be made by branching
    if (pos != nullptr) {
        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    } else {
        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
        );
    }
}
|
5520
5662
|
|
5521
|
-
// Host-side launcher for the rope_glm_f32 kernel (F32 data only).
//
// x, dst       - F32 input / output buffers
// ncols, nrows - extent of the data; ncols must be a multiple of 4
// pos, freq_scale, p_delta_rows, freq_base, n_ctx
//              - forwarded to the kernel unchanged
// stream       - CUDA stream the kernel is enqueued on
//
// Grid layout: x indexes column blocks, y indexes rows (one block per row).
// A block holds CUDA_ROPE_BLOCK_SIZE/4 threads along x and the grid's x extent
// is ncols divided by CUDA_ROPE_BLOCK_SIZE, rounded up.
static void rope_glm_f32_cuda(
    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream
) {
    GGML_ASSERT(ncols % 4 == 0);

    const int col_blocks = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;

    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const dim3 block_nums(col_blocks, nrows, 1);

    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
}
|
5529
5673
|
|
5530
5674
|
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
@@ -5661,11 +5805,21 @@ void ggml_init_cublas() {
|
|
5661
5805
|
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
5662
5806
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
5663
5807
|
int64_t total_vram = 0;
|
5808
|
+
#if defined(GGML_CUDA_FORCE_MMQ)
|
5809
|
+
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
5810
|
+
#else
|
5811
|
+
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
5812
|
+
#endif
|
5813
|
+
#if defined(CUDA_USE_TENSOR_CORES)
|
5814
|
+
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
5815
|
+
#else
|
5816
|
+
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
5817
|
+
#endif
|
5664
5818
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
5665
|
-
for (
|
5819
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5666
5820
|
cudaDeviceProp prop;
|
5667
5821
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
5668
|
-
fprintf(stderr, " Device %
|
5822
|
+
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
5669
5823
|
|
5670
5824
|
g_tensor_split[id] = total_vram;
|
5671
5825
|
total_vram += prop.totalGlobalMem;
|
@@ -5675,15 +5829,15 @@ void ggml_init_cublas() {
|
|
5675
5829
|
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
5676
5830
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5677
5831
|
}
|
5678
|
-
for (
|
5832
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5679
5833
|
g_tensor_split[id] /= total_vram;
|
5680
5834
|
}
|
5681
5835
|
|
5682
|
-
for (
|
5836
|
+
for (int id = 0; id < g_device_count; ++id) {
|
5683
5837
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
5684
5838
|
|
5685
5839
|
// create cuda streams
|
5686
|
-
for (
|
5840
|
+
for (int is = 0; is < MAX_STREAMS; ++is) {
|
5687
5841
|
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
5688
5842
|
}
|
5689
5843
|
|
@@ -5907,7 +6061,10 @@ inline void ggml_cuda_op_add(
|
|
5907
6061
|
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5908
6062
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
5909
6063
|
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
|
6064
|
+
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
6065
|
+
add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5910
6066
|
} else {
|
6067
|
+
fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
|
5911
6068
|
GGML_ASSERT(false);
|
5912
6069
|
}
|
5913
6070
|
|
@@ -6252,16 +6409,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
6252
6409
|
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6253
6410
|
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
6254
6411
|
|
6255
|
-
GGML_ASSERT(src0_dd_i
|
6412
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
6256
6413
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
6257
|
-
GGML_ASSERT(dst_dd_i
|
6258
|
-
|
6414
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
6259
6415
|
|
6260
6416
|
const int64_t ne00 = src0->ne[0];
|
6261
|
-
|
6262
6417
|
const int64_t ne10 = src1->ne[0];
|
6263
6418
|
|
6264
6419
|
const int64_t ne0 = dst->ne[0];
|
6420
|
+
|
6265
6421
|
const int64_t row_diff = row_high - row_low;
|
6266
6422
|
|
6267
6423
|
int id;
|
@@ -6346,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
6346
6502
|
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
6347
6503
|
row_diff, src1_ncols, ne10,
|
6348
6504
|
&alpha, src0_ddf_i, ne00,
|
6349
|
-
src1_ddf_i,
|
6505
|
+
src1_ddf_i, ne10,
|
6350
6506
|
&beta, dst_dd_i, ldc));
|
6351
6507
|
|
6352
6508
|
if (src0_as != 0) {
|
@@ -6372,17 +6528,20 @@ inline void ggml_cuda_op_rope(
|
|
6372
6528
|
const int64_t ne2 = dst->ne[2];
|
6373
6529
|
const int64_t nrows = ggml_nrows(src0);
|
6374
6530
|
|
6375
|
-
//const int n_past
|
6376
|
-
const int n_dims
|
6377
|
-
const int mode
|
6378
|
-
const int n_ctx
|
6379
|
-
|
6380
|
-
|
6381
|
-
float freq_base, freq_scale;
|
6382
|
-
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
6383
|
-
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
6531
|
+
//const int n_past = ((int32_t *) dst->op_params)[0];
|
6532
|
+
const int n_dims = ((int32_t *) dst->op_params)[1];
|
6533
|
+
const int mode = ((int32_t *) dst->op_params)[2];
|
6534
|
+
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
6535
|
+
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
6384
6536
|
|
6385
|
-
|
6537
|
+
// RoPE alteration for extended context
|
6538
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
6539
|
+
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
6540
|
+
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
6541
|
+
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
6542
|
+
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
6543
|
+
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
6544
|
+
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
6386
6545
|
|
6387
6546
|
const int32_t * pos = nullptr;
|
6388
6547
|
if ((mode & 1) == 0) {
|
@@ -6394,24 +6553,39 @@ inline void ggml_cuda_op_rope(
|
|
6394
6553
|
const bool is_neox = mode & 2;
|
6395
6554
|
const bool is_glm = mode & 4;
|
6396
6555
|
|
6556
|
+
rope_corr_dims corr_dims;
|
6557
|
+
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
6558
|
+
|
6397
6559
|
// compute
|
6398
6560
|
if (is_glm) {
|
6399
6561
|
GGML_ASSERT(false);
|
6400
|
-
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
|
6562
|
+
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
6401
6563
|
} else if (is_neox) {
|
6402
6564
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
6403
6565
|
if (src0->type == GGML_TYPE_F32) {
|
6404
|
-
rope_neox_cuda(
|
6566
|
+
rope_neox_cuda(
|
6567
|
+
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6568
|
+
attn_factor, corr_dims, main_stream
|
6569
|
+
);
|
6405
6570
|
} else if (src0->type == GGML_TYPE_F16) {
|
6406
|
-
rope_neox_cuda(
|
6571
|
+
rope_neox_cuda(
|
6572
|
+
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6573
|
+
attn_factor, corr_dims, main_stream
|
6574
|
+
);
|
6407
6575
|
} else {
|
6408
6576
|
GGML_ASSERT(false);
|
6409
6577
|
}
|
6410
6578
|
} else {
|
6411
6579
|
if (src0->type == GGML_TYPE_F32) {
|
6412
|
-
rope_cuda(
|
6580
|
+
rope_cuda(
|
6581
|
+
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6582
|
+
attn_factor, corr_dims, main_stream
|
6583
|
+
);
|
6413
6584
|
} else if (src0->type == GGML_TYPE_F16) {
|
6414
|
-
rope_cuda(
|
6585
|
+
rope_cuda(
|
6586
|
+
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
6587
|
+
attn_factor, corr_dims, main_stream
|
6588
|
+
);
|
6415
6589
|
} else {
|
6416
6590
|
GGML_ASSERT(false);
|
6417
6591
|
}
|
@@ -6522,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
|
|
6522
6696
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6523
6697
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6524
6698
|
|
6525
|
-
|
6526
|
-
|
6699
|
+
float min;
|
6700
|
+
float max;
|
6701
|
+
memcpy(&min, dst->op_params, sizeof(float));
|
6702
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
6527
6703
|
|
6528
6704
|
clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
|
6529
6705
|
CUDA_CHECK(cudaGetLastError());
|
@@ -7013,7 +7189,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|
7013
7189
|
}
|
7014
7190
|
|
7015
7191
|
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
7016
|
-
GGML_ASSERT(!
|
7192
|
+
GGML_ASSERT(!ggml_is_transposed(src0));
|
7193
|
+
GGML_ASSERT(!ggml_is_transposed(src1));
|
7017
7194
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
7018
7195
|
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
7019
7196
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
@@ -7023,11 +7200,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7023
7200
|
const int64_t ne01 = src0->ne[1];
|
7024
7201
|
const int64_t ne02 = src0->ne[2];
|
7025
7202
|
|
7026
|
-
const int64_t ne12 = src1->ne[2];
|
7027
|
-
|
7028
7203
|
const int64_t nb01 = src0->nb[1];
|
7029
7204
|
const int64_t nb02 = src0->nb[2];
|
7030
7205
|
|
7206
|
+
const int64_t ne12 = src1->ne[2];
|
7207
|
+
|
7031
7208
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7032
7209
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
7033
7210
|
|
@@ -7046,27 +7223,210 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
7046
7223
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
7047
7224
|
}
|
7048
7225
|
|
7226
|
+
// Fills a table of per-batch matrix pointers.
//
// ptrs is treated as three consecutive sections of ne23 entries each:
//   ptrs[0*ne23 + s] - pointer into src0_as_f16
//   ptrs[1*ne23 + s] - pointer into src1_as_f16
//   ptrs[2*ne23 + s] - pointer into dst_f16
// where s = i12 + i13*ne12 identifies the batch entry (i12, i13).
//
// ne12, ne13 - number of batch entries along the two batch dimensions
// ne23       - number of entries per section of ptrs
// nb02, nb03 - strides applied to src0 (as byte offsets) for its batch indices
// nb12, nb13 - strides for src1; halved before being applied as byte offsets
// nb2,  nb3  - strides for dst;  halved before being applied as byte offsets
//              NOTE(review): the halving presumably converts F32 byte strides
//              to strides of the F16 buffers - confirm against the caller
// r2, r3     - divisors mapping a src1 batch index to a src0 batch index
//              (i02 = i12 / r2, i03 = i13 / r3)
//
// Thread mapping: x indexes i13, y indexes i12; threads outside
// [0, ne13) x [0, ne12) do nothing.
//
// All offsets and table indices are computed in 64-bit arithmetic. The
// parameters are 32-bit ints, and products such as i02*nb02 can exceed
// INT_MAX for large tensors even when each factor fits, which would be signed
// overflow and yield a wrong pointer.
__global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
        void ** ptrs,
        int ne12, int ne13,
        int ne23,
        int nb02, int nb03,
        int nb12, int nb13,
        int nb2, int nb3,
        int r2, int r3) {
    const int i13 = blockIdx.x * blockDim.x + threadIdx.x;
    const int i12 = blockIdx.y * blockDim.y + threadIdx.y;

    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }

    // batch indices of the src0 matrix paired with this (i12, i13) entry
    const int64_t i03 = i13 / r3;
    const int64_t i02 = i12 / r2;

    // position of this batch entry inside each section of the table
    const int64_t slot    = (int64_t) i12 + (int64_t) i13*ne12;
    const int64_t section = ne23;

    // same expressions as before, evaluated in 64 bits
    // (multiplication first, then the division by 2)
    const int64_t off_src0 = i02*nb02 + i03*nb03;
    const int64_t off_src1 = (int64_t) i12*nb12/2 + (int64_t) i13*nb13/2;
    const int64_t off_dst  = (int64_t) i12*nb2/2  + (int64_t) i13*nb3/2;

    ptrs[0*section + slot] = (char *) src0_as_f16 + off_src0;
    ptrs[1*section + slot] = (char *) src1_as_f16 + off_src1;
    ptrs[2*section + slot] = (char *)     dst_f16 + off_dst;
}
|
7249
|
+
|
7250
|
+
// Batched F16 x F32 matrix multiplication on the main device using cuBLAS.
//
// Requirements (asserted below): src0 is F16, src1 is F32, neither is transposed, and src0 is not
// split across GPUs. ne12 must be a multiple of ne02 and ne13 a multiple of ne03; src0 is broadcast
// over src1 along dims 2 and 3 by these factors.
//
// Steps:
//   1. convert src1 to F16 into a pool buffer
//   2. compute the batched product in F16 (CUBLAS_COMPUTE_16F) into a second pool buffer, using
//      cublasGemmStridedBatchedEx when there is no broadcast and both inputs are contiguous across
//      dims 2 and 3, and cublasGemmBatchedEx with a device-built pointer table otherwise
//   3. convert the F16 result to F32 into dst
//
// All work is enqueued on stream 0 of the main device.
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];

    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];

    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

    // total element counts, used to size the temporary F16 buffers
    const int64_t ne1 = ggml_nelements(src1);
    const int64_t ne  = ggml_nelements(dst);

    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
    half * src0_as_f16 = (half *) src0_ddq;

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // convert src1 to fp16
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    // temporary F16 destination, converted to F32 at the end
    size_t dst_as = 0;
    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;

    // NOTE(review): the leading dimensions and strides of src1/dst below are the F32 byte strides
    // divided by sizeof(float), i.e. element strides reused for the F16 copies. This presumably relies
    // on to_fp16_cuda keeping the element layout of src1 - confirm for non-contiguous src1.

#if 0
    // use cublasGemmEx
    {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;

                CUBLAS_CHECK(
                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
                            CUBLAS_COMPUTE_16F,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
    }
#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
                ne12*ne13,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;

        // device table of 3*ne23 pointers: src0 matrices, then src1 matrices, then dst matrices
        void ** ptrs_as = nullptr;
        size_t ptrs_s = 0;
        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);

        // cover the ne13 x ne12 entries with a 2D grid of fixed-size blocks: a single block of
        // ne13 x ne12 threads cannot be launched once ne12*ne13 exceeds the per-block thread limit
        // (1024 on current CUDA devices). the kernel derives its indices from blockIdx and
        // bounds-checks them, so the padding threads of the edge blocks write nothing
        const int ptrs_threads_x = 16;
        const int ptrs_threads_y = 16;

        const dim3 block_dims(ptrs_threads_x, ptrs_threads_y);
        const dim3 grid_dims(
                (ne13 + ptrs_threads_x - 1) / ptrs_threads_x,
                (ne12 + ptrs_threads_y - 1) / ptrs_threads_y);

        k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
                src0_as_f16, src1_as_f16, dst_f16,
                ptrs_as,
                ne12, ne13,
                ne23,
                nb02, nb03,
                nb12, nb13,
                dst->nb[2], dst->nb[3],
                r2, r3);
        CUDA_CHECK(cudaGetLastError());

        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
                &beta_f16,  (      void **       ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));

        ggml_cuda_pool_free(ptrs_as, ptrs_s);
    }
#endif

    // convert the F16 result back to F32 into dst
    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);

    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
}
|
7389
|
+
|
7049
7390
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7050
|
-
bool all_on_device =
|
7051
|
-
|
7391
|
+
const bool all_on_device =
|
7392
|
+
(src0->backend == GGML_BACKEND_GPU) &&
|
7393
|
+
(src1->backend == GGML_BACKEND_GPU) &&
|
7394
|
+
( dst->backend == GGML_BACKEND_GPU);
|
7052
7395
|
|
7053
7396
|
int64_t min_compute_capability = INT_MAX;
|
7054
7397
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
7055
|
-
if (min_compute_capability > g_compute_capabilities[id]
|
7056
|
-
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
7398
|
+
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
7057
7399
|
min_compute_capability = g_compute_capabilities[id];
|
7058
7400
|
}
|
7059
7401
|
}
|
7060
7402
|
|
7061
|
-
|
7403
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
7404
|
+
const bool use_tensor_cores = true;
|
7405
|
+
#else
|
7406
|
+
const bool use_tensor_cores = false;
|
7407
|
+
#endif
|
7408
|
+
|
7409
|
+
// debug helpers
|
7410
|
+
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
7411
|
+
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
7412
|
+
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
7413
|
+
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
7414
|
+
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
7415
|
+
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
7416
|
+
|
7417
|
+
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
7418
|
+
// KQ single-batch
|
7062
7419
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
7063
|
-
} else if (all_on_device && !ggml_is_contiguous(src0) &&
|
7420
|
+
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
7421
|
+
// KQV single-batch
|
7064
7422
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
7423
|
+
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
7424
|
+
// KQ + KQV multi-batch
|
7425
|
+
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
7065
7426
|
} else if (src0->type == GGML_TYPE_F32) {
|
7066
7427
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
7067
7428
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
7068
7429
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
7069
|
-
|
7070
7430
|
#ifdef GGML_CUDA_FORCE_DMMV
|
7071
7431
|
const bool use_mul_mat_vec_q = false;
|
7072
7432
|
#else
|
@@ -7079,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
7079
7439
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
7080
7440
|
}
|
7081
7441
|
} else {
|
7082
|
-
|
7442
|
+
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
7443
|
+
|
7444
|
+
// when tensor cores are available, use them for large batch size
|
7445
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
7446
|
+
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
|
7447
|
+
use_mul_mat_q = false;
|
7448
|
+
}
|
7449
|
+
|
7450
|
+
if (use_mul_mat_q) {
|
7083
7451
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
7084
7452
|
} else {
|
7085
7453
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
@@ -7433,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|
7433
7801
|
}
|
7434
7802
|
}
|
7435
7803
|
|
7436
|
-
void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
|
7437
|
-
g_mul_mat_q = mul_mat_q;
|
7438
|
-
}
|
7439
|
-
|
7440
7804
|
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
7441
7805
|
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
7442
7806
|
// it still won't always work as expected, but it's better than nothing
|