llama_cpp 0.5.3 → 0.7.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -14,9 +15,11 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -77,9 +80,9 @@
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define
+#define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA2 CC_OFFSET_AMD + 1030
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
 return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }

+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
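The hunk above replaces the single to_fp32_cuda_t function-pointer typedef with a templated alias, so the same converter signature can be instantiated for both float and half destinations (to_fp32_cuda_t and the new to_fp16_cuda_t). A minimal stand-alone C++ sketch of that pattern, with double standing in for CUDA's half and the stream argument dropped (names here are illustrative, not part of the gem):

#include <cstdio>

// Templated alias for a "convert k elements of x into y" function pointer.
template <typename T>
using to_t_fn = void (*)(const void * x, T * y, int k);

// One possible instantiation target: widen bytes to the destination type.
template <typename T>
static void convert_bytes(const void * x, T * y, int k) {
    const unsigned char * src = static_cast<const unsigned char *>(x);
    for (int i = 0; i < k; ++i) {
        y[i] = static_cast<T>(src[i]);
    }
}

int main() {
    to_t_fn<float>  to_f32 = convert_bytes<float>;   // analogous to to_fp32_cuda_t
    to_t_fn<double> to_f64 = convert_bytes<double>;  // analogous to to_fp16_cuda_t

    unsigned char data[4] = {1, 2, 3, 4};
    float  f[4];
    double d[4];
    to_f32(data, f, 4);
    to_f64(data, d, 4);
    printf("%.1f %.1f\n", f[0], d[3]);
    return 0;
}

Keeping every converter behind one signature is what lets ggml_get_to_fp16_cuda, later in this diff, return any of them from a single switch.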
@@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}
 static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size =
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -708,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 //================================== k-quants

-
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

 const int i = blockIdx.x;
 const block_q2_K * x = (const block_q2_K *) vx;
@@ -720,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
 const int is = 8*n + l/16;

 const uint8_t q = x[i].qs[32*n + l];
-
+dst_t * y = yy + i*QK_K + 128*n;

 float dall = __low2half(x[i].dm);
 float dmin = __high2half(x[i].dm);
@@ -732,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
 const int is = tid/16; // 0 or 1
 const int il = tid%16; // 0...15
 const uint8_t q = x[i].qs[il] >> (2*is);
-
+dst_t * y = yy + i*QK_K + 16*is + il;
 float dall = __low2half(x[i].dm);
 float dmin = __high2half(x[i].dm);
 y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
@@ -741,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float

 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

 const int i = blockIdx.x;
 const block_q3_K * x = (const block_q3_K *) vx;
@@ -765,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
 float d_all = x[i].d;
 float dl = d_all * (us - 32);

-
+dst_t * y = yy + i*QK_K + 128*n + 32*j;
 const uint8_t * q = x[i].qs + 32*n;
 const uint8_t * hm = x[i].hmask;

@@ -777,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
 const int im = il/8; // 0...1
 const int in = il%8; // 0...7

-
+dst_t * y = yy + i*QK_K + 16*is + il;

 const uint8_t q = x[i].qs[il] >> (2*is);
 const uint8_t h = x[i].hmask[in] >> (2*is + im);
@@ -805,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif

-
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q4_K * x = (const block_q4_K *) vx;

 const int i = blockIdx.x;
@@ -818,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 const int is = 2*il;
 const int n = 4;

-
+dst_t * y = yy + i*QK_K + 64*il + n*ir;

 const float dall = __low2half(x[i].dm);
 const float dmin = __high2half(x[i].dm);
@@ -837,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #else
 const int tid = threadIdx.x;
 const uint8_t * q = x[i].qs;
-
+dst_t * y = yy + i*QK_K;
 const float d = (float)x[i].dm[0];
 const float m = (float)x[i].dm[1];
 y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
@@ -845,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #endif
 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q5_K * x = (const block_q5_K *) vx;

 const int i = blockIdx.x;
@@ -857,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 const int ir = tid%16; // ir is in 0...15
 const int is = 2*il; // is is in 0...6

-
+dst_t * y = yy + i*QK_K + 64*il + 2*ir;

 const float dall = __low2half(x[i].dm);
 const float dmin = __high2half(x[i].dm);
@@ -885,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 const int is = tid/16; // 0 or 1
 const uint8_t h = x[i].qh[in] >> im;
 const float d = x[i].d;
-
+dst_t * y = yy + i*QK_K + tid;
 y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
 y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
 #endif
 }

-
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 const block_q6_K * x = (const block_q6_K *) vx;

 const int i = blockIdx.x;
@@ -903,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
 const int il = tid - 32*ip; // 0...32
 const int is = 8*ip + il/16;

-
+dst_t * y = yy + i*QK_K + 128*ip + il;

 const float d = x[i].d;

@@ -922,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
 const int ip = tid/16; // 0 or 1
 const int il = tid - 16*ip; // 0...15

-
+dst_t * y = yy + i*QK_K + 16*ip + il;

 const float d = x[i].d;

@@ -1515,6 +1527,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
 v.y = x[ib + iqs + 1];
 }

+static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+const float * x = (const float *) vx;
+
+// automatic half -> float type cast if dfloat == float
+v.x = x[ib + iqs + 0];
+v.y = x[ib + iqs + 1];
+}
+
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
 const int ix = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1554,8 +1574,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * __restrict__ vx,
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
 const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

 if (i >= k) {
@@ -3533,7 +3553,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_0_AMPERE;
 const int mmq_y = MMQ_Y_Q4_0_AMPERE;
 const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3553,7 +3573,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_1_RDNA2 64
@@ -3574,9 +3594,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_1(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3596,7 +3616,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_1_AMPERE;
 const int mmq_y = MMQ_Y_Q4_1_AMPERE;
 const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3616,7 +3636,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_1_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_0_RDNA2 64
@@ -3657,7 +3677,7 @@ template <bool need_check> static __global__ void
 load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_0_AMPERE;
 const int mmq_y = MMQ_Y_Q5_0_AMPERE;
 const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3677,7 +3697,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q5_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_1_RDNA2 64
@@ -3718,7 +3738,7 @@ mul_mat_q5_1(
 load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_1_AMPERE;
 const int mmq_y = MMQ_Y_Q5_1_AMPERE;
 const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3738,7 +3758,7 @@ mul_mat_q5_1(
 #else
 (void) vec_dot_q5_1_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q8_0_RDNA2 64
@@ -3779,7 +3799,7 @@ template <bool need_check> static __global__ void
 load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q8_0_AMPERE;
 const int mmq_y = MMQ_Y_Q8_0_AMPERE;
 const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3799,7 +3819,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q8_0_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q2_K_RDNA2 64
@@ -3840,7 +3860,7 @@ mul_mat_q2_K(
 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q2_K_AMPERE;
 const int mmq_y = MMQ_Y_Q2_K_AMPERE;
 const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3860,7 +3880,7 @@ mul_mat_q2_K(
 #else
 (void) vec_dot_q2_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q3_K_RDNA2 128
@@ -3881,9 +3901,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q3_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3903,7 +3923,7 @@ template <bool need_check> static __global__ void
 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q3_K_AMPERE;
 const int mmq_y = MMQ_Y_Q3_K_AMPERE;
 const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3923,7 +3943,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q3_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_K_RDNA2 64
@@ -3944,9 +3964,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q4_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3966,7 +3986,7 @@ template <bool need_check> static __global__ void
 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q4_K_AMPERE;
 const int mmq_y = MMQ_Y_Q4_K_AMPERE;
 const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3986,7 +4006,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q4_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_K_RDNA2 64
@@ -4027,7 +4047,7 @@ mul_mat_q5_K(
 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q5_K_AMPERE;
 const int mmq_y = MMQ_Y_Q5_K_AMPERE;
 const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -4047,7 +4067,7 @@ mul_mat_q5_K(
 #else
 (void) vec_dot_q5_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q6_K_RDNA2 64
@@ -4068,9 +4088,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
 __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
 __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
 mul_mat_q6_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -4090,7 +4110,7 @@ template <bool need_check> static __global__ void
 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
 (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
 const int mmq_x = MMQ_X_Q6_K_AMPERE;
 const int mmq_y = MMQ_Y_Q6_K_AMPERE;
 const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4110,7 +4130,7 @@ template <bool need_check> static __global__ void
 #else
 (void) vec_dot_q6_K_q8_1_mul_mat;
 assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -4355,8 +4375,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }

 // rope == RoPE == rotary positional embedding
-
-
+
+template<typename T, bool has_pos>
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale) {
 const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

 if (col >= ncols) {
@@ -4365,8 +4387,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

 const int row = blockDim.x*blockIdx.x + threadIdx.x;
 const int i = row*ncols + col;
+const int i2 = row/p_delta_rows;

-const
+const int p = has_pos ? pos[i2] : 0;
+const float p0 = p*freq_scale;
+const float theta = p0*powf(theta_scale, col/2);
 const float sin_theta = sinf(theta);
 const float cos_theta = cosf(theta);

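In the templated rope kernel above, the per-token position now comes from the pos array (p = has_pos ? pos[i2] : 0) rather than a single scalar, and the rotation angle is theta = p * freq_scale * theta_scale^(col/2). A small CPU-side sketch of the rotation applied to one element pair (illustrative only, with made-up constants; not code from the gem):

#include <cmath>
#include <cstdio>

// Rotate the pair (x0, x1) by theta, as the rope<T, has_pos> kernel does per pair.
static void rope_pair(float x0, float x1, int p, float freq_scale,
                      float theta_scale, int col, float * out0, float * out1) {
    const float theta = p * freq_scale * powf(theta_scale, col / 2);
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);
    *out0 = x0 * cos_theta - x1 * sin_theta;
    *out1 = x0 * sin_theta + x1 * cos_theta;
}

int main() {
    float y0, y1;
    // Example: position 7, n_dims = 128 -> theta_scale = powf(10000.0f, -2.0f/128)
    rope_pair(0.5f, -1.25f, 7, 1.0f, powf(10000.0f, -2.0f / 128), 10, &y0, &y1);
    printf("%f %f\n", y0, y1);
    return 0;
}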
@@ -4377,8 +4402,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
 dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-
-
+template<typename T, bool has_pos>
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale) {
 const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

 if (col >= ncols) {
@@ -4387,8 +4413,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

 const int row = blockDim.x*blockIdx.x + threadIdx.x;
 const int i = row*ncols + col/2;
+const int i2 = row/p_delta_rows;

-const
+const int p = has_pos ? pos[i2] : 0;
+const float p0 = p*freq_scale;
+const float theta = p0*powf(theta_scale, col/2);
 const float sin_theta = sinf(theta);
 const float cos_theta = cosf(theta);

@@ -4399,8 +4428,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
 dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
-const
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale, const int n_ctx) {
 const int col = blockDim.x*blockIdx.x + threadIdx.x;
 const int half_n_dims = ncols/4;

@@ -4410,11 +4439,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

 const int row = blockDim.y*blockIdx.y + threadIdx.y;
 const int i = row*ncols + col;
+const int i2 = row/p_delta_rows;

 const float col_theta_scale = powf(theta_scale, col);
-
+// FIXME: this is likely wrong
+const int p = pos != nullptr ? pos[i2] : 0;

-const float theta = min(p,
+const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
 const float sin_theta = sinf(theta);
 const float cos_theta = cosf(theta);

@@ -4424,7 +4455,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 dst[i + 0] = x0*cos_theta - x1*sin_theta;
 dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-const float block_theta = max(p -
+const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
 const float sin_block_theta = sinf(block_theta);
 const float cos_block_theta = cosf(block_theta);

@@ -4578,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
 quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
 dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4612,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4621,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

-
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4635,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 const int nb = k / QK_K;
 #if QK_K == 256
 dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -4826,6 +4867,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4835,6 +4881,35 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
 <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+switch (type) {
+case GGML_TYPE_Q4_0:
+return dequantize_row_q4_0_cuda;
+case GGML_TYPE_Q4_1:
+return dequantize_row_q4_1_cuda;
+case GGML_TYPE_Q5_0:
+return dequantize_row_q5_0_cuda;
+case GGML_TYPE_Q5_1:
+return dequantize_row_q5_1_cuda;
+case GGML_TYPE_Q8_0:
+return dequantize_row_q8_0_cuda;
+case GGML_TYPE_Q2_K:
+return dequantize_row_q2_K_cuda;
+case GGML_TYPE_Q3_K:
+return dequantize_row_q3_K_cuda;
+case GGML_TYPE_Q4_K:
+return dequantize_row_q4_K_cuda;
+case GGML_TYPE_Q5_K:
+return dequantize_row_q5_K_cuda;
+case GGML_TYPE_Q6_K:
+return dequantize_row_q6_K_cuda;
+case GGML_TYPE_F32:
+return convert_fp32_to_fp16_cuda;
+default:
+return nullptr;
+}
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 switch (type) {
 case GGML_TYPE_Q4_0:
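The new ggml_get_to_fp16_cuda mirrors the existing ggml_get_to_fp32_cuda selector, but returns converters that write half; GGML_TYPE_F32 is covered by the new convert_fp32_to_fp16_cuda, and unsupported types yield nullptr. The fp16 matrix-multiply path later in this diff uses it roughly like this (sketch only; src_type, src_dev, dst_f16, n, and stream are placeholders, not names from the gem):

// Select a quantized/f32 -> fp16 converter for the source type and run it.
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src_type);
GGML_ASSERT(to_fp16_cuda != nullptr);      // nullptr means no fp16 conversion exists for this type
to_fp16_cuda(src_dev, dst_f16, n, stream); // dequantize/convert n values into a half buffer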
@@ -4881,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
 mmq_x = MMQ_X_Q4_0_RDNA1;
 mmq_y = MMQ_Y_Q4_0_RDNA1;
 nwarps = NWARPS_Q4_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_0_AMPERE;
 mmq_y = MMQ_Y_Q4_0_AMPERE;
 nwarps = NWARPS_Q4_0_AMPERE;
@@ -4926,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
 mmq_x = MMQ_X_Q4_1_RDNA1;
 mmq_y = MMQ_Y_Q4_1_RDNA1;
 nwarps = NWARPS_Q4_1_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_1_AMPERE;
 mmq_y = MMQ_Y_Q4_1_AMPERE;
 nwarps = NWARPS_Q4_1_AMPERE;
@@ -4971,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
 mmq_x = MMQ_X_Q5_0_RDNA1;
 mmq_y = MMQ_Y_Q5_0_RDNA1;
 nwarps = NWARPS_Q5_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_0_AMPERE;
 mmq_y = MMQ_Y_Q5_0_AMPERE;
 nwarps = NWARPS_Q5_0_AMPERE;
@@ -5016,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
 mmq_x = MMQ_X_Q5_1_RDNA1;
 mmq_y = MMQ_Y_Q5_1_RDNA1;
 nwarps = NWARPS_Q5_1_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_1_AMPERE;
 mmq_y = MMQ_Y_Q5_1_AMPERE;
 nwarps = NWARPS_Q5_1_AMPERE;
@@ -5061,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
 mmq_x = MMQ_X_Q8_0_RDNA1;
 mmq_y = MMQ_Y_Q8_0_RDNA1;
 nwarps = NWARPS_Q8_0_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q8_0_AMPERE;
 mmq_y = MMQ_Y_Q8_0_AMPERE;
 nwarps = NWARPS_Q8_0_AMPERE;
@@ -5106,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
 mmq_x = MMQ_X_Q2_K_RDNA1;
 mmq_y = MMQ_Y_Q2_K_RDNA1;
 nwarps = NWARPS_Q2_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q2_K_AMPERE;
 mmq_y = MMQ_Y_Q2_K_AMPERE;
 nwarps = NWARPS_Q2_K_AMPERE;
@@ -5153,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
 mmq_x = MMQ_X_Q3_K_RDNA1;
 mmq_y = MMQ_Y_Q3_K_RDNA1;
 nwarps = NWARPS_Q3_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q3_K_AMPERE;
 mmq_y = MMQ_Y_Q3_K_AMPERE;
 nwarps = NWARPS_Q3_K_AMPERE;
@@ -5199,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
 mmq_x = MMQ_X_Q4_K_RDNA1;
 mmq_y = MMQ_Y_Q4_K_RDNA1;
 nwarps = NWARPS_Q4_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q4_K_AMPERE;
 mmq_y = MMQ_Y_Q4_K_AMPERE;
 nwarps = NWARPS_Q4_K_AMPERE;
@@ -5244,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
 mmq_x = MMQ_X_Q5_K_RDNA1;
 mmq_y = MMQ_Y_Q5_K_RDNA1;
 nwarps = NWARPS_Q5_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q5_K_AMPERE;
 mmq_y = MMQ_Y_Q5_K_AMPERE;
 nwarps = NWARPS_Q5_K_AMPERE;
@@ -5289,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
 mmq_x = MMQ_X_Q6_K_RDNA1;
 mmq_y = MMQ_Y_Q6_K_RDNA1;
 nwarps = NWARPS_Q6_K_RDNA1;
-} else if (compute_capability >=
+} else if (compute_capability >= CC_VOLTA) {
 mmq_x = MMQ_X_Q6_K_AMPERE;
 mmq_y = MMQ_Y_Q6_K_AMPERE;
 nwarps = NWARPS_Q6_K_AMPERE;
@@ -5361,31 +5436,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

-
-
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
 GGML_ASSERT(ncols % 2 == 0);
 const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 const dim3 block_nums(nrows, num_blocks_x, 1);
-
+if (pos == nullptr) {
+rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+} else {
+rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+}
 }

-
-
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
 GGML_ASSERT(ncols % 2 == 0);
 const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
 const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
 const dim3 block_nums(nrows, num_blocks_x, 1);
-
+if (pos == nullptr) {
+rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+} else {
+rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+}
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
 GGML_ASSERT(ncols % 4 == 0);
 const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
 const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
 const dim3 block_nums(num_blocks_x, nrows, 1);
-rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
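rope_cuda and rope_neox_cuda turn the runtime pos == nullptr check into a compile-time has_pos template argument, so the kernels never branch on it per thread. The same dispatch idiom in plain C++ (a sketch under those assumptions, not code from the gem):

#include <cstdio>

// A runtime flag picks a template instantiation, so the hot loop sees a compile-time constant.
template <bool has_pos>
static void apply(const int * pos, float * x, int n) {
    for (int i = 0; i < n; ++i) {
        const int p = has_pos ? pos[i] : 0;  // this branch folds away per instantiation
        x[i] += static_cast<float>(p);
    }
}

static void apply_dispatch(const int * pos, float * x, int n) {
    if (pos == nullptr) {
        apply<false>(pos, x, n);
    } else {
        apply<true>(pos, x, n);
    }
}

int main() {
    float x[3] = {1.0f, 2.0f, 3.0f};
    const int pos[3] = {10, 20, 30};
    apply_dispatch(nullptr, x, 3);  // no positions: adds 0
    apply_dispatch(pos, x, 3);      // with positions: adds pos[i]
    printf("%.1f %.1f %.1f\n", x[0], x[1], x[2]);
    return 0;
}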
@@ -5857,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
 switch(type) {
 case GGML_TYPE_Q4_0:
 case GGML_TYPE_Q4_1:
-return max_compute_capability >=
+return max_compute_capability >= CC_VOLTA ? 128 : 64;
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
@@ -5868,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
 case GGML_TYPE_Q3_K:
 case GGML_TYPE_Q4_K:
 case GGML_TYPE_Q5_K:
-return max_compute_capability >=
+return max_compute_capability >= CC_VOLTA ? 128 : 64;
 case GGML_TYPE_Q6_K:
 return 64;
 default:
@@ -6016,8 +6101,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
 GGML_ASSERT(src1_ddf_i != nullptr);
 GGML_ASSERT(dst_dd_i != nullptr);

-const float alpha = 1.0f;
-const float beta = 0.0f;

 const int64_t ne00 = src0->ne[0];

@@ -6026,16 +6109,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
 const int64_t ne0 = dst->ne[0];
 const int64_t row_diff = row_high - row_low;

-float * src0_ddq_as_f32;
-size_t src0_as = 0;
-
-if (src0->type != GGML_TYPE_F32) {
-const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-}
-const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
 int id;
 CUDA_CHECK(cudaGetDevice(&id));

@@ -6043,16 +6116,87 @@ inline void ggml_cuda_op_mul_mat_cublas(
 // ldc == nrows of the matrix that cuBLAS writes into
 int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

-
-
-
-
-
-
-
+const int compute_capability = g_compute_capabilities[id];
+
+if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+half * src0_as_f16 = nullptr;
+size_t src0_as = 0;
+if (src0->type != GGML_TYPE_F16) {
+const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+GGML_ASSERT(to_fp16_cuda != nullptr);
+size_t ne = row_diff*ne00;
+src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+}
+const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+
+half * src1_as_f16 = nullptr;
+size_t src1_as = 0;
+if (src1->type != GGML_TYPE_F16) {
+const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+GGML_ASSERT(to_fp16_cuda != nullptr);
+size_t ne = src1_ncols*ne10;
+src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+}
+const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+size_t dst_as = 0;
+half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+const half alpha_f16 = 1.0f;
+const half beta_f16 = 0.0f;
+
+CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+CUBLAS_CHECK(
+cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+row_diff, src1_ncols, ne10,
+&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+src1_ptr, CUDA_R_16F, ne10,
+&beta_f16, dst_f16, CUDA_R_16F, ldc,
+CUBLAS_COMPUTE_16F,
+CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+ggml_cuda_pool_free(dst_f16, dst_as);
+
+if (src0_as != 0) {
+ggml_cuda_pool_free(src0_as_f16, src0_as);
+}

-
-
+if (src1_as != 0) {
+ggml_cuda_pool_free(src1_as_f16, src1_as);
+}
+}
+else {
+float * src0_ddq_as_f32 = nullptr;
+size_t src0_as = 0;
+
+if (src0->type != GGML_TYPE_F32) {
+const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+GGML_ASSERT(to_fp32_cuda != nullptr);
+src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+}
+const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+const float alpha = 1.0f;
+const float beta = 0.0f;
+
+CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+CUBLAS_CHECK(
+cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+row_diff, src1_ncols, ne10,
+&alpha, src0_ddf_i, ne00,
+src1_ddf_i, ne10,
+&beta, dst_dd_i, ldc));
+
+if (src0_as != 0) {
+ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+}
 }

 (void) dst;
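On devices at or above CC_VOLTA, the new path converts src0 and src1 to fp16, runs the GEMM with cublasGemmEx using CUBLAS_COMPUTE_16F, and converts the fp16 result back to fp32; otherwise it falls back to the previous cublasSgemm route. For orientation, here is a self-contained sketch of the same cublasGemmEx call shape outside of ggml (assumes cuBLAS 11 or newer, omits error checking, and is not code from this diff):

#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int m = 4, n = 3, k = 2;
    std::vector<half> a(m * k), b(k * n), c(m * n);
    for (auto & v : a) v = __float2half(1.0f);
    for (auto & v : b) v = __float2half(2.0f);

    half *da, *db, *dc;
    cudaMalloc(&da, a.size() * sizeof(half));
    cudaMalloc(&db, b.size() * sizeof(half));
    cudaMalloc(&dc, c.size() * sizeof(half));
    cudaMemcpy(da, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice);
    cudaMemcpy(db, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const half alpha = __float2half(1.0f);
    const half beta  = __float2half(0.0f);
    // C (m x n) = A (m x k) * B (k x n); column-major, fp16 storage and fp16 compute.
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                 &alpha, da, CUDA_R_16F, m,
                         db, CUDA_R_16F, k,
                 &beta,  dc, CUDA_R_16F, m,
                 CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    cudaMemcpy(c.data(), dc, c.size() * sizeof(half), cudaMemcpyDeviceToHost);
    printf("c[0] = %.1f (expected 4.0)\n", __half2float(c[0]));

    cublasDestroy(handle);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}

In the diff itself, the call uses CUBLAS_OP_T for src0 (its rows are contiguous, so it is transposed from cuBLAS's column-major point of view) and takes its buffers from ggml_cuda_pool_malloc rather than cudaMalloc.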
@@ -6064,14 +6208,16 @@ inline void ggml_cuda_op_rope(
 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
 const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-GGML_ASSERT(src0->type == GGML_TYPE_F32);
-GGML_ASSERT( dst->type == GGML_TYPE_F32);
+GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+GGML_ASSERT(src0->type == dst->type);

 const int64_t ne00 = src0->ne[0];
 const int64_t ne01 = src0->ne[1];
+const int64_t ne2 = dst->ne[2];
 const int64_t nrows = ggml_nrows(src0);

-const int n_past = ((int32_t *) dst->op_params)[0];
+//const int n_past = ((int32_t *) dst->op_params)[0];
 const int n_dims = ((int32_t *) dst->op_params)[1];
 const int mode = ((int32_t *) dst->op_params)[2];
 const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6082,19 +6228,38 @@ inline void ggml_cuda_op_rope(
 memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

 const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
+const int32_t * pos = nullptr;
+if ((mode & 1) == 0) {
+GGML_ASSERT(src1->type == GGML_TYPE_I32);
+GGML_ASSERT(src1->ne[0] == ne2);
+pos = (const int32_t *) src1_dd;
+}

 const bool is_neox = mode & 2;
 const bool is_glm = mode & 4;

 // compute
 if (is_glm) {
-
+GGML_ASSERT(false);
+rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
 } else if (is_neox) {
 GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-
+if (src0->type == GGML_TYPE_F32) {
+rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+} else if (src0->type == GGML_TYPE_F16) {
+rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+} else {
+GGML_ASSERT(false);
+}
 } else {
-
+if (src0->type == GGML_TYPE_F32) {
+rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+} else if (src0->type == GGML_TYPE_F16) {
+rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+} else {
+GGML_ASSERT(false);
+}
 }

 (void) src1;
@@ -6265,7 +6430,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
 }
 }

-void ggml_cuda_set_peer_access(const int n_tokens) {
+static void ggml_cuda_set_peer_access(const int n_tokens) {
 static bool peer_access_enabled = false;

 const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -6593,27 +6758,27 @@ static void ggml_cuda_op_mul_mat(
 }
 }

-void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }

-void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }

-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }

-void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

-void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

@@ -6624,17 +6789,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
 const int64_t ne1 = dst->ne[1];

 // TODO: find the optimal values for these
-
-
-
-
-return true;
-}
-
-return false;
+return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+src1->type == GGML_TYPE_F32 &&
+dst->type == GGML_TYPE_F32 &&
+(ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }

-void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
 GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
 GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
 GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6663,7 +6824,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }

-void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
 GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
 GGML_ASSERT(!ggml_is_permuted(src0));
 GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6697,7 +6858,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
 src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

@@ -6741,11 +6902,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
 }
 }

-void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }

-void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 const int64_t ne = ggml_nelements(src0);
 GGML_ASSERT(ne == ggml_nelements(src1));

@@ -6787,35 +6948,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
 ne10, ne11, nb10, nb11, nb12, main_stream);
 } else {
+fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+ggml_type_name(src0->type), ggml_type_name(src1->type));
 GGML_ASSERT(false);
 }

 (void) dst;
 }

-void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_cpy(src0, dst, nullptr);
 (void) src1;
 }

-void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }

-void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }

-void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }

-void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

-void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 (void) src0;
 (void) src1;
 (void) dst;
@@ -6938,11 +7101,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
 return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
 if (scratch && g_scratch_size == 0) {
 return;
 }

+tensor->backend = GGML_BACKEND_GPU;
+
 // recursively assign CUDA buffers until a compute tensor is found
 if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
 const ggml_op src0_op = tensor->src[0]->op;
@@ -6954,8 +7119,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
 ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
 }

-tensor->backend = GGML_BACKEND_GPU;
-
 if (scratch && no_alloc) {
 return;
 }
@@ -7040,6 +7203,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 tensor->extra = extra;
 }

+void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(ggml_is_contiguous(tensor));
+
+struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
 ggml_cuda_assign_buffers_impl(tensor, true, false, false);
 }
@@ -7075,7 +7247,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }

 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-
+// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+// it still won't always work as expected, but it's better than nothing
+if (scratch_size > g_scratch_size) {
+ggml_cuda_free_scratch();
+}
+g_scratch_size = std::max(g_scratch_size, scratch_size);
 }

 void ggml_cuda_free_scratch() {