llama_cpp 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -29,6 +29,8 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
@@ -85,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+// probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

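Note on the new defines above: unless GGML_CUDA_FORCE_MMQ is defined, CUDA_USE_TENSOR_CORES is defined and the MMQ kernels are only preferred up to MMQ_MAX_BATCH_SIZE rows of src1. A minimal sketch of that compile-time gate in isolation; passing `-DGGML_CUDA_FORCE_MMQ` to the compiler is just one way to define the macro, not a documented llama_cpp build option:

```cpp
// Sketch of the compile-time gate introduced in the hunk above; build with e.g.
// `nvcc -DGGML_CUDA_FORCE_MMQ ...` (or any other way of defining the macro)
// to force the quantized MMQ kernels and skip the tensor-core cuBLAS path.
#include <cstdio>

#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

int main() {
#ifdef CUDA_USE_TENSOR_CORES
    std::printf("tensor cores enabled: MMQ is used only up to MMQ_MAX_BATCH_SIZE (32)\n");
#else
    std::printf("GGML_CUDA_FORCE_MMQ defined: MMQ kernels are always used\n");
#endif
    return 0;
}
```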
@@ -468,7 +488,6 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -494,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
     dst[i] = __hadd(x[i], __float2half(y[i]));
 }

+static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __half2float(x[i]) + y[i];
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -3552,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_X_Q4_0_RDNA1 64
 #define MMQ_Y_Q4_0_RDNA1 64
 #define NWARPS_Q4_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_0_AMPERE 4
+#define MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
+#endif
 #define MMQ_X_Q4_0_PASCAL 64
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
@@ -3613,9 +3647,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_1_RDNA1 64
 #define MMQ_Y_Q4_1_RDNA1 64
 #define NWARPS_Q4_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_1_AMPERE 4
+#define MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
+#endif
 #define MMQ_X_Q4_1_PASCAL 64
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8
@@ -3676,9 +3716,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_0_RDNA1 64
 #define MMQ_Y_Q5_0_RDNA1 64
 #define NWARPS_Q5_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_0_AMPERE 4
+#define MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
+#endif
 #define MMQ_X_Q5_0_PASCAL 64
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
@@ -3737,9 +3783,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_1_RDNA1 64
 #define MMQ_Y_Q5_1_RDNA1 64
 #define NWARPS_Q5_1_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_1_AMPERE 4
+#define MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
+#endif
 #define MMQ_X_Q5_1_PASCAL 64
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
@@ -3798,9 +3850,15 @@ mul_mat_q5_1(
 #define MMQ_X_Q8_0_RDNA1 64
 #define MMQ_Y_Q8_0_RDNA1 64
 #define NWARPS_Q8_0_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q8_0_AMPERE 4
+#define MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
+#endif
 #define MMQ_X_Q8_0_PASCAL 64
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
@@ -3859,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3920,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -3983,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4046,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4107,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
@@ -4326,13 +4414,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous

     const half * x = (const half *) vx;

-    const int row_x
-    const int channel
+    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
+    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
     const int channel_x = channel / channel_x_divisor;

-    const int nrows_y
+    const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
-    const int row_dst
+    const int row_dst = row_x;

     const int idst = channel*nrows_dst + row_dst;

@@ -4345,13 +4433,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }

-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const float xi = __half2float(x[ix]);
-
         const int row_y = col_x;

+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const int iy = channel*nrows_y + row_y;

+        const float xi = __half2float(x[ix]);
+
         tmp += xi * y[iy];
     }

@@ -4405,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }

-
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}

+// rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
-static __global__ void rope(
-
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4421,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
     const int i2 = row/p_delta_rows;

     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);

     const float x0 = x[i + 0];
     const float x1 = x[i + 1];
@@ -4434,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
 }

 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-
+static __global__ void rope_neox(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4446,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     const int i = row*ncols + col/2;
     const int i2 = row/p_delta_rows;

+    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+    const float cur_rot = -float(col)/ncols;
+
     const int p = has_pos ? pos[i2] : 0;
-    const float
-
-
-
+    const float theta_base = p*powf(freq_base, cur_rot);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

     const float x0 = x[i + 0];
     const float x1 = x[i + ncols/2];
@@ -4459,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(
-
+static __global__ void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4472,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
     const int i2 = row/p_delta_rows;

-    const float col_theta_scale = powf(
+    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
     // FIXME: this is likely wrong
     const int p = pos != nullptr ? pos[i2] : 0;

@@ -4614,6 +4739,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
     add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

+static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5491,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }

 template<typename T>
-static void rope_cuda(
-
+static void rope_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }

 template<typename T>
-static void rope_neox_cuda(
-
+static void rope_neox_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
     }
 }

-static void rope_glm_f32_cuda(
-
+static void rope_glm_f32_cuda(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, int n_ctx, cudaStream_t stream
+) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5661,11 +5805,21 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-    for (
+    for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -5675,15 +5829,15 @@ void ggml_init_cublas() {
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
-    for (
+    for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }

-    for (
+    for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));

         // create cuda streams
-        for (
+        for (int is = 0; is < MAX_STREAMS; ++is) {
             CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
         }

@@ -5907,7 +6061,10 @@ inline void ggml_cuda_op_add(
         add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
         add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
     } else {
+        fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
         GGML_ASSERT(false);
     }

@@ -6252,16 +6409,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {

-    GGML_ASSERT(src0_dd_i
+    GGML_ASSERT(src0_dd_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i
-
+    GGML_ASSERT(dst_dd_i != nullptr);

     const int64_t ne00 = src0->ne[0];
-
     const int64_t ne10 = src1->ne[0];

     const int64_t ne0 = dst->ne[0];
+
     const int64_t row_diff = row_high - row_low;

     int id;
@@ -6346,7 +6502,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
                 &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i,
+                        src1_ddf_i, ne10,
                 &beta, dst_dd_i, ldc));

     if (src0_as != 0) {
@@ -6372,17 +6528,20 @@ inline void ggml_cuda_op_rope(
     const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);

-    //const int n_past
-    const int n_dims
-    const int mode
-    const int n_ctx
-
-
-    float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

-
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
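The memcpy offsets above describe how RoPE parameters are packed into the 32-bit op_params array of the destination tensor. A host-side sketch of that layout for reference; the rope_params struct and unpack_rope_params helper are illustrative names only, not part of ggml or this gem:

```cpp
// Layout of dst->op_params as read by ggml_cuda_op_rope in the hunk above:
// slots [0..4] are ints (n_past, n_dims, mode, n_ctx, n_orig_ctx) and slots
// [5..10] hold floats bit-cast into int32 storage (freq_base, freq_scale,
// ext_factor, attn_factor, beta_fast, beta_slow).
#include <cstdint>
#include <cstring>

struct rope_params {
    int32_t n_past, n_dims, mode, n_ctx, n_orig_ctx;
    float   freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
};

static rope_params unpack_rope_params(const int32_t * p) {
    rope_params rp{};
    rp.n_past = p[0]; rp.n_dims = p[1]; rp.mode = p[2]; rp.n_ctx = p[3]; rp.n_orig_ctx = p[4];
    std::memcpy(&rp.freq_base,   p + 5,  sizeof(float));
    std::memcpy(&rp.freq_scale,  p + 6,  sizeof(float));
    std::memcpy(&rp.ext_factor,  p + 7,  sizeof(float));
    std::memcpy(&rp.attn_factor, p + 8,  sizeof(float));
    std::memcpy(&rp.beta_fast,   p + 9,  sizeof(float));
    std::memcpy(&rp.beta_slow,   p + 10, sizeof(float));
    return rp;
}
```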
@@ -6394,24 +6553,39 @@ inline void ggml_cuda_op_rope(
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;

+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
     // compute
     if (is_glm) {
         GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01,
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
+            rope_neox_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
             GGML_ASSERT(false);
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
+            rope_cuda(
+                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
         } else {
             GGML_ASSERT(false);
         }
@@ -6522,8 +6696,10 @@ inline void ggml_cuda_op_clamp(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-
-
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -7013,7 +7189,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
 }

 static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -7023,11 +7200,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];

-    const int64_t ne12 = src1->ne[2];
-
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];

+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -7046,27 +7223,210 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

+__global__ void k_compute_batched_ptrs(
+        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        void ** ptrs,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3) {
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                    cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+                                    (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                        &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                        CUBLAS_COMPUTE_16F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(
+            cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+                ne12*ne13,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    } else {
+        // use cublasGemmBatchedEx
+        const int ne23 = ne12*ne13;
+
+        void ** ptrs_as = nullptr;
+        size_t ptrs_s = 0;
+        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        dim3 block_dims(ne13, ne12);
+        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+                src0_as_f16, src1_as_f16, dst_f16,
+                ptrs_as,
+                ne12, ne13,
+                ne23,
+                nb02, nb03,
+                nb12, nb13,
+                dst->nb[2], dst->nb[3],
+                r2, r3);
+        CUDA_CHECK(cudaGetLastError());
+
+        CUBLAS_CHECK(
+            cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                ne23,
+                CUBLAS_COMPUTE_16F,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        ggml_cuda_pool_free(ptrs_as, ptrs_s);
+    }
+#endif
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    bool all_on_device =
-
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);

     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id]
-            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
             min_compute_capability = g_compute_capabilities[id];
         }
     }

-
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !ggml_is_contiguous(src0) &&
+    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
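The batched path above asserts ne12 % ne02 == 0 and ne13 % ne03 == 0 and derives broadcast factors r2 = ne12/ne02 and r3 = ne13/ne03; k_compute_batched_ptrs then maps each (i12, i13) GEMM of the batch back to src0 slice (i12/r2, i13/r3). A CPU-side sketch of that index mapping with made-up shapes (the shape values are purely illustrative):

```cpp
// Mirror of the (i12, i13) -> (i02, i03) mapping used by k_compute_batched_ptrs,
// run on the host for hypothetical batch dims: src0 has ne02=2, ne03=1 and
// src1 has ne12=4, ne13=1, so r2=2 and r3=1.
#include <cstdio>

int main() {
    const int ne02 = 2, ne03 = 1;                  // src0 batch dims (example values)
    const int ne12 = 4, ne13 = 1;                  // src1 batch dims (example values)
    const int r2 = ne12 / ne02, r3 = ne13 / ne03;  // broadcast factors, as in the diff

    for (int i13 = 0; i13 < ne13; ++i13) {
        for (int i12 = 0; i12 < ne12; ++i12) {
            // each GEMM in the batch reuses one src0 slice via integer division
            std::printf("GEMM (%d,%d) uses src0 slice (%d,%d)\n", i12, i13, i12 / r2, i13 / r3);
        }
    }
    return 0;
}
```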
@@ -7079,7 +7439,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
             }
         } else {
-
+            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+            // when tensor cores are available, use them for large batch size
+            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
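Condensed, the new dispatch rule in this hunk is: quantized src0 still goes through the MMQ kernels when the device supports DP4A, except that on tensor-core hardware batches larger than MMQ_MAX_BATCH_SIZE are routed to cuBLAS. A standalone restatement as a sketch (MIN_CC_DP4A and CC_VOLTA are taken as parameters here because their numeric values do not appear in this excerpt):

```cpp
// Sketch of the MMQ-vs-cuBLAS decision from the hunk above.
#define MMQ_MAX_BATCH_SIZE 32   // value from the diff

static bool should_use_mul_mat_q(bool src0_is_quantized, bool tensor_cores_enabled,
                                 int min_compute_capability, int min_cc_dp4a, int cc_volta,
                                 int batch_size /* src1->ne[1] */) {
    bool use_mmq = min_compute_capability >= min_cc_dp4a && src0_is_quantized;

    // large batches prefer the F16 tensor-core cuBLAS path (llama.cpp PR #3776)
    if (tensor_cores_enabled && min_compute_capability >= cc_volta && batch_size > MMQ_MAX_BATCH_SIZE) {
        use_mmq = false;
    }
    return use_mmq;
}
```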
@@ -7433,10 +7801,6 @@ void ggml_cuda_set_main_device(const int main_device) {
     }
 }

-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-    g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing