llama_cpp 0.6.0 → 0.7.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags

@@ -80,9 +81,9 @@
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define
+#define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA2 CC_OFFSET_AMD + 1030
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X

@@ -715,7 +718,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 //================================== k-quants

-static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;

@@ -727,7 +731,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int is = 8*n + l/16;

     const uint8_t q = x[i].qs[32*n + l];
-    float * y = yy + i*QK_K + 128*n;
+    dst_t * y = yy + i*QK_K + 128*n;

     float dall = __low2half(x[i].dm);
     float dmin = __high2half(x[i].dm);

@@ -739,7 +743,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int is = tid/16; // 0 or 1
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
-    float * y = yy + i*QK_K + 16*is + il;
+    dst_t * y = yy + i*QK_K + 16*is + il;
     float dall = __low2half(x[i].dm);
     float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);

@@ -748,7 +752,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float

 }

-static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;

@@ -772,7 +777,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
     float d_all = x[i].d;
     float dl = d_all * (us - 32);

-    float * y = yy + i*QK_K + 128*n + 32*j;
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
     const uint8_t * q = x[i].qs + 32*n;
     const uint8_t * hm = x[i].hmask;

@@ -784,7 +789,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float
     const int im = il/8; // 0...1
     const int in = il%8; // 0...7

-    float * y = yy + i*QK_K + 16*is + il;
+    dst_t * y = yy + i*QK_K + 16*is + il;

     const uint8_t q = x[i].qs[il] >> (2*is);
     const uint8_t h = x[i].hmask[in] >> (2*is + im);

@@ -812,7 +817,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
 }
 #endif

-static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;

@@ -825,7 +831,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int is = 2*il;
     const int n = 4;

-    float * y = yy + i*QK_K + 64*il + n*ir;
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;

     const float dall = __low2half(x[i].dm);
     const float dmin = __high2half(x[i].dm);

@@ -844,7 +850,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #else
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
-    float * y = yy + i*QK_K;
+    dst_t * y = yy + i*QK_K;
     const float d = (float)x[i].dm[0];
     const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);

@@ -852,7 +858,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 #endif
 }

-static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

     const int i = blockIdx.x;

@@ -864,7 +871,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
     const int ir = tid%16; // ir is in 0...15
     const int is = 2*il; // is is in 0...6

-    float * y = yy + i*QK_K + 64*il + 2*ir;
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;

     const float dall = __low2half(x[i].dm);
     const float dmin = __high2half(x[i].dm);

@@ -892,13 +899,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
     const int is = tid/16; // 0 or 1
     const uint8_t h = x[i].qh[in] >> im;
     const float d = x[i].d;
-    float * y = yy + i*QK_K + tid;
+    dst_t * y = yy + i*QK_K + tid;
     y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
     y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
 #endif
 }

-static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;

@@ -910,7 +918,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
     const int il = tid - 32*ip; // 0...32
     const int is = 8*ip + il/16;

-    float * y = yy + i*QK_K + 128*ip + il;
+    dst_t * y = yy + i*QK_K + 128*ip + il;

     const float d = x[i].d;

@@ -929,7 +937,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float
     const int ip = tid/16; // 0 or 1
     const int il = tid - 16*ip; // 0...15

-    float * y = yy + i*QK_K + 16*ip + il;
+    dst_t * y = yy + i*QK_K + 16*ip + il;

     const float d = x[i].d;

@@ -1569,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0] = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
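Editor's aside (not part of the diff): the new k_get_rows kernel above indexes quantized data the same way the existing dequantize_block kernel does, with each thread producing two destination values because the per-format dequantize_kernel emits a dfloat2 pair. A worked example of that index arithmetic, using made-up sizes (qk = 32, qr = 2, i.e. a q4_0-style block) and arbitrary row/column values purely for illustration:

```cpp
// Worked example of the index math in k_get_rows (illustration only).
const int qk = 32, qr = 2, ncols = 4096;   // assumed block shape and row width
const int row = 7, col = 40;               // col is always even in the kernel
const int r   = 123;                       // row id fetched from the int32 index tensor
const int xi  = r*ncols + col;             // flat index of the source element
const int ib  = xi/qk;                     // quantized block that holds it
const int iqs = (xi%qk)/qr;                // position of the quant inside the block
const int di  = row*ncols + col;           // flat index of the destination element
const int iybs = di - di%qk;               // start of the destination block
const int y_offset = qr == 1 ? 1 : qk/2;   // 16 here
// dequantize_kernel(x, ib, iqs, v) then writes:
//   dst[iybs + iqs]            = v.x;
//   dst[iybs + iqs + y_offset] = v.y;     // second value lands half a block away
```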
@@ -3548,7 +3584,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;

@@ -3568,7 +3604,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_1_RDNA2 64

@@ -3589,9 +3625,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3611,7 +3647,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;

@@ -3631,7 +3667,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_0_RDNA2 64

@@ -3672,7 +3708,7 @@ template <bool need_check> static __global__ void
         load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;

@@ -3692,7 +3728,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_1_RDNA2 64

@@ -3733,7 +3769,7 @@ mul_mat_q5_1(
         load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;

@@ -3753,7 +3789,7 @@ mul_mat_q5_1(
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q8_0_RDNA2 64

@@ -3794,7 +3830,7 @@ template <bool need_check> static __global__ void
         load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;

@@ -3814,7 +3850,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q2_K_RDNA2 64

@@ -3855,7 +3891,7 @@ mul_mat_q2_K(
         load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q2_K_AMPERE;
     const int mmq_y = MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;

@@ -3875,7 +3911,7 @@ mul_mat_q2_K(
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q3_K_RDNA2 128

@@ -3896,9 +3932,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3918,7 +3954,7 @@ template <bool need_check> static __global__ void
         load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;

@@ -3938,7 +3974,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q4_K_RDNA2 64

@@ -3959,9 +3995,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3981,7 +4017,7 @@ template <bool need_check> static __global__ void
         load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;

@@ -4001,7 +4037,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q5_K_RDNA2 64

@@ -4042,7 +4078,7 @@ mul_mat_q5_K(
         load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;

@@ -4062,7 +4098,7 @@ mul_mat_q5_K(
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 #define MMQ_X_Q6_K_RDNA2 64

@@ -4083,9 +4119,9 @@ template <bool need_check> static __global__ void
 #if defined(RDNA3) || defined(RDNA2)
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ <
+#elif __CUDA_ARCH__ < CC_VOLTA
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ <
+#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -4105,7 +4141,7 @@ template <bool need_check> static __global__ void
         load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);

-#elif __CUDA_ARCH__ >=
+#elif __CUDA_ARCH__ >= CC_VOLTA
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;

@@ -4125,7 +4161,7 @@ template <bool need_check> static __global__ void
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
     assert(false);
-#endif // __CUDA_ARCH__ >=
+#endif // __CUDA_ARCH__ >= CC_VOLTA
 }

 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>

@@ -4550,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }

+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
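Editor's aside (not part of the diff): the get_rows_cuda launcher above sizes its grid with a ceiling division over pairs of columns, since each k_get_rows thread writes two values. A quick numeric check, assuming CUDA_GET_ROWS_BLOCK_SIZE keeps the value 256 from the defines hunk and a hypothetical row width of 1000:

```cpp
// Grid sizing used by get_rows_cuda, with example numbers (illustration only).
const int block_size = 256;    // CUDA_GET_ROWS_BLOCK_SIZE
const int ncols      = 1000;   // hypothetical row width
// each thread covers 2 columns, so round ncols/(2*block_size) up
const int block_num_x = (ncols + 2*block_size - 1) / (2*block_size);   // = 2 blocks in x
// grid = (block_num_x, nrows, 1); threads whose col >= ncols simply return
```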
@@ -4604,32 +4658,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
 }

-static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -4638,7 +4698,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -4647,12 +4708,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

-static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -4661,7 +4724,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu
 #endif
 }

-static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -4868,6 +4932,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa

 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_row_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_row_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_row_q8_0_cuda;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
         case GGML_TYPE_F32:
             return convert_fp32_to_fp16_cuda;
         default:
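Editor's aside (not part of the diff): with the dequantize_row_*_cuda functions now templated on the destination type, ggml_get_to_fp16_cuda can return a half-precision converter for quantized types, not just F32. A hedged sketch of how a caller uses the returned pointer, mirroring the cuBLAS hunk further down; the device buffers and element count here are placeholders, not code from the library:

```cpp
// Illustrative fragment: src0_q, dst_f16 and n_elements are placeholder values.
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(GGML_TYPE_Q4_K);
GGML_ASSERT(to_fp16_cuda != nullptr);                 // nullptr would mean an unsupported type
to_fp16_cuda(src0_q, dst_f16, n_elements, stream);    // dequantize n_elements values to half on the given stream
```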
@@ -4921,7 +5005,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
         mmq_x = MMQ_X_Q4_0_RDNA1;
         mmq_y = MMQ_Y_Q4_0_RDNA1;
         nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;

@@ -4966,7 +5050,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
         mmq_x = MMQ_X_Q4_1_RDNA1;
         mmq_y = MMQ_Y_Q4_1_RDNA1;
         nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;

@@ -5011,7 +5095,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
         mmq_x = MMQ_X_Q5_0_RDNA1;
         mmq_y = MMQ_Y_Q5_0_RDNA1;
         nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;

@@ -5056,7 +5140,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
         mmq_x = MMQ_X_Q5_1_RDNA1;
         mmq_y = MMQ_Y_Q5_1_RDNA1;
         nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;

@@ -5101,7 +5185,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
         mmq_x = MMQ_X_Q8_0_RDNA1;
         mmq_y = MMQ_Y_Q8_0_RDNA1;
         nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;

@@ -5146,7 +5230,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
         mmq_x = MMQ_X_Q2_K_RDNA1;
         mmq_y = MMQ_Y_Q2_K_RDNA1;
         nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;

@@ -5193,7 +5277,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mmq_x = MMQ_X_Q3_K_RDNA1;
         mmq_y = MMQ_Y_Q3_K_RDNA1;
         nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;

@@ -5239,7 +5323,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
         mmq_x = MMQ_X_Q4_K_RDNA1;
         mmq_y = MMQ_Y_Q4_K_RDNA1;
         nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;

@@ -5284,7 +5368,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
         mmq_x = MMQ_X_Q5_K_RDNA1;
         mmq_y = MMQ_Y_Q5_K_RDNA1;
         nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;

@@ -5329,7 +5413,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
         mmq_x = MMQ_X_Q6_K_RDNA1;
         mmq_y = MMQ_Y_Q6_K_RDNA1;
         nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >=
+    } else if (compute_capability >= CC_VOLTA) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;

@@ -5401,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                       const int p_delta_rows, const float theta_scale, cudaStream_t stream) {

@@ -5668,7 +5757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];

@@ -5704,6 +5793,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }

+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -5907,7 +6097,7 @@ static int64_t get_row_rounding(ggml_type type) {
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
-            return max_compute_capability >=
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:

@@ -5918,7 +6108,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
-            return max_compute_capability >=
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
         default:

@@ -6083,8 +6273,19 @@ inline void ggml_cuda_op_mul_mat_cublas(

     const int compute_capability = g_compute_capabilities[id];

-    if (compute_capability >=
-        // convert src1 to fp16, multiply as fp16, convert dst to fp32
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        half * src0_as_f16 = nullptr;
+        size_t src0_as = 0;
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
+            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+        }
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+
         half * src1_as_f16 = nullptr;
         size_t src1_as = 0;
         if (src1->type != GGML_TYPE_F16) {

@@ -6106,9 +6307,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
-                    &alpha_f16,
-                    src1_ptr,
-                    &beta_f16, dst_f16,
+                    &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+                                src1_ptr, CUDA_R_16F, ne10,
+                    &beta_f16, dst_f16, CUDA_R_16F, ldc,
                     CUBLAS_COMPUTE_16F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));

@@ -6117,6 +6318,10 @@ inline void ggml_cuda_op_mul_mat_cublas(

         ggml_cuda_pool_free(dst_f16, dst_as);

+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
+        }
+
         if (src1_as != 0) {
             ggml_cuda_pool_free(src1_as_f16, src1_as);
         }

@@ -6229,12 +6434,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-    GGML_ASSERT(ne01 + n_past == ne00);
+    //GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);

     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

@@ -6293,7 +6498,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }

     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());

@@ -6303,6 +6515,24 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);

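Editor's aside (not part of the diff): ggml_cuda_op_clamp reads the min/max bounds from dst->op_params and launches clamp_f32, which applies the usual elementwise clamp. The scalar reference it matches, for illustration only:

```cpp
// Scalar reference for clamp_f32 (illustration only).
static inline float clamp_ref(float x, float lo, float hi) {
    return x < lo ? lo : (x > hi ? hi : x);   // same expression as the kernel
}
```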
@@ -6312,9 +6542,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
|
6312
6542
|
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
6313
6543
|
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
6314
6544
|
|
6315
|
-
|
6316
|
-
|
6317
|
-
|
6545
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6546
|
+
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
6547
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6318
6548
|
|
6319
6549
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6320
6550
|
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
@@ -6455,9 +6685,9 @@ static void ggml_cuda_op_mul_mat(
|
|
6455
6685
|
const size_t q8_1_ts = sizeof(block_q8_1);
|
6456
6686
|
const size_t q8_1_bs = QK8_1;
|
6457
6687
|
|
6458
|
-
|
6459
|
-
|
6460
|
-
|
6688
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6689
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6690
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6461
6691
|
|
6462
6692
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6463
6693
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
@@ -6535,7 +6765,7 @@ static void ggml_cuda_op_mul_mat(
|
|
6535
6765
|
if (convert_src1_to_q8_1) {
|
6536
6766
|
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6537
6767
|
|
6538
|
-
if (
|
6768
|
+
if (src1_on_device && src1_is_contiguous) {
|
6539
6769
|
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6540
6770
|
CUDA_CHECK(cudaGetLastError());
|
6541
6771
|
}
|
@@ -6617,7 +6847,7 @@ static void ggml_cuda_op_mul_mat(
|
|
6617
6847
|
GGML_ASSERT(false);
|
6618
6848
|
}
|
6619
6849
|
|
6620
|
-
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6850
|
+
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
6621
6851
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6622
6852
|
CUDA_CHECK(cudaGetLastError());
|
6623
6853
|
}
|
@@ -6708,6 +6938,14 @@ static void ggml_cuda_op_mul_mat(
|
|
6708
6938
|
}
|
6709
6939
|
}
|
6710
6940
|
|
6941
|
+
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6942
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
|
6943
|
+
}
|
6944
|
+
|
6945
|
+
static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6946
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
|
6947
|
+
}
|
6948
|
+
|
6711
6949
|
static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6712
6950
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
6713
6951
|
}
|
@@ -6762,13 +7000,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|
6762
7000
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6763
7001
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6764
7002
|
|
6765
|
-
|
7003
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6766
7004
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6767
7005
|
|
6768
|
-
|
7006
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6769
7007
|
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6770
7008
|
|
6771
|
-
|
7009
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6772
7010
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6773
7011
|
|
6774
7012
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
@@ -6793,13 +7031,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
6793
7031
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6794
7032
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6795
7033
|
|
6796
|
-
|
7034
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6797
7035
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6798
7036
|
|
6799
|
-
|
7037
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6800
7038
|
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6801
7039
|
|
6802
|
-
|
7040
|
+
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6803
7041
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6804
7042
|
|
6805
7043
|
const int64_t row_stride_x = nb01 / sizeof(half);
|
@@ -6820,11 +7058,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
6820
7058
|
}
|
6821
7059
|
}
|
6822
7060
|
|
6823
|
-
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
7061
|
+
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
6824
7062
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
6825
7063
|
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
6826
7064
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
6827
|
-
}else if (src0->type == GGML_TYPE_F32) {
|
7065
|
+
} else if (src0->type == GGML_TYPE_F32) {
|
6828
7066
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
6829
7067
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
6830
7068
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
@@ -6856,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
6856
7094
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
6857
7095
|
}
|
6858
7096
|
|
7097
|
+
static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
7098
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
|
7099
|
+
}
|
7100
|
+
|
6859
7101
|
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6860
7102
|
const int64_t ne = ggml_nelements(src0);
|
6861
7103
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
@@ -6885,8 +7127,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
|
6885
7127
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6886
7128
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6887
7129
|
|
6888
|
-
const
|
6889
|
-
const
|
7130
|
+
const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
7131
|
+
const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6890
7132
|
|
6891
7133
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
6892
7134
|
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
@@ -6941,8 +7183,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
6941
7183
|
|
6942
7184
|
const size_t nb1 = tensor->nb[1];
|
6943
7185
|
|
6944
|
-
|
6945
|
-
|
7186
|
+
ggml_backend_type backend = tensor->backend;
|
7187
|
+
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
6946
7188
|
memset(extra, 0, sizeof(*extra));
|
6947
7189
|
|
6948
7190
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
@@ -6996,7 +7238,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
|
6996
7238
|
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
6997
7239
|
}
|
6998
7240
|
|
6999
|
-
|
7000
7241
|
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
7001
7242
|
|
7002
7243
|
extra->data_device[id] = buf;
|
@@ -7035,17 +7276,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
|
7035
7276
|
delete extra;
|
7036
7277
|
}
|
7037
7278
|
|
7038
|
-
static
|
7279
|
+
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
7039
7280
|
static size_t g_temp_tensor_extra_index = 0;
|
7040
7281
|
|
7041
|
-
static
|
7282
|
+
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
7042
7283
|
if (g_temp_tensor_extras == nullptr) {
|
7043
7284
|
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
7044
7285
|
}
|
7045
7286
|
|
7046
7287
|
size_t alloc_index = g_temp_tensor_extra_index;
|
7047
7288
|
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
7048
|
-
|
7289
|
+
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
7049
7290
|
memset(extra, 0, sizeof(*extra));
|
7050
7291
|
|
7051
7292
|
return extra;
|
@@ -7073,7 +7314,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7073
7314
|
return;
|
7074
7315
|
}
|
7075
7316
|
|
7076
|
-
|
7317
|
+
ggml_tensor_extra_gpu * extra;
|
7077
7318
|
|
7078
7319
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7079
7320
|
tensor->op == GGML_OP_VIEW ||
|
@@ -7082,7 +7323,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7082
7323
|
|
7083
7324
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7084
7325
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7085
|
-
|
7326
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7086
7327
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7087
7328
|
size_t offset = 0;
|
7088
7329
|
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7091,7 +7332,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7091
7332
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7092
7333
|
extra->data_device[g_main_device] = src0_ddc + offset;
|
7093
7334
|
} else if (tensor->op == GGML_OP_CPY) {
|
7094
|
-
|
7335
|
+
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
7095
7336
|
void * src1_ddv = src1_extra->data_device[g_main_device];
|
7096
7337
|
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7097
7338
|
extra->data_device[g_main_device] = src1_ddv;
|
@@ -7133,13 +7374,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
7133
7374
|
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
7134
7375
|
}
|
7135
7376
|
|
7136
|
-
|
7377
|
+
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
7137
7378
|
|
7138
7379
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7139
7380
|
tensor->op == GGML_OP_VIEW;
|
7140
7381
|
|
7141
7382
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7142
|
-
|
7383
|
+
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7143
7384
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7144
7385
|
size_t view_offset = 0;
|
7145
7386
|
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7157,7 +7398,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
|
7157
7398
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
7158
7399
|
GGML_ASSERT(ggml_is_contiguous(tensor));
|
7159
7400
|
|
7160
|
-
|
7401
|
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
7161
7402
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7162
7403
|
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
7163
7404
|
}
|
@@ -7214,58 +7455,47 @@ void ggml_cuda_free_scratch() {
7214 7455 |     g_scratch_buffer = nullptr;
7215 7456 | }
7216 7457 |
7217      | - bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     7458 | + bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7218 7459 |     ggml_cuda_func_t func;
7219 7460 |     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7220 7461 |         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7221 7462 |         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7222 7463 |
     7464 | +   if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
     7465 | +       return false;
     7466 | +   }
     7467 | +
7223 7468 |     switch (tensor->op) {
     7469 | +       case GGML_OP_REPEAT:
     7470 | +           func = ggml_cuda_repeat;
     7471 | +           break;
     7472 | +       case GGML_OP_GET_ROWS:
     7473 | +           func = ggml_cuda_get_rows;
     7474 | +           break;
7224 7475 |         case GGML_OP_DUP:
7225      | -           if (!any_on_device) {
7226      | -               return false;
7227      | -           }
7228 7476 |             func = ggml_cuda_dup;
7229 7477 |             break;
7230 7478 |         case GGML_OP_ADD:
7231      | -           if (!any_on_device) {
7232      | -               return false;
7233      | -           }
7234 7479 |             func = ggml_cuda_add;
7235 7480 |             break;
7236 7481 |         case GGML_OP_MUL:
7237      | -           if (!any_on_device) {
7238      | -               return false;
7239      | -           }
7240 7482 |             func = ggml_cuda_mul;
7241 7483 |             break;
7242 7484 |         case GGML_OP_UNARY:
7243 7485 |             switch (ggml_get_unary_op(tensor)) {
7244 7486 |                 case GGML_UNARY_OP_GELU:
7245      | -                   if (!any_on_device) {
7246      | -                       return false;
7247      | -                   }
7248 7487 |                     func = ggml_cuda_gelu;
7249 7488 |                     break;
7250 7489 |                 case GGML_UNARY_OP_SILU:
7251      | -                   if (!any_on_device) {
7252      | -                       return false;
7253      | -                   }
7254 7490 |                     func = ggml_cuda_silu;
7255 7491 |                     break;
7256 7492 |                 default:
7257 7493 |                     return false;
7258 7494 |             } break;
7259 7495 |         case GGML_OP_NORM:
7260      | -           if (!any_on_device) {
7261      | -               return false;
7262      | -           }
7263 7496 |             func = ggml_cuda_norm;
7264 7497 |             break;
7265 7498 |         case GGML_OP_RMS_NORM:
7266      | -           if (!any_on_device) {
7267      | -               return false;
7268      | -           }
7269 7499 |             func = ggml_cuda_rms_norm;
7270 7500 |             break;
7271 7501 |         case GGML_OP_MUL_MAT:
@@ -7275,54 +7505,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
7275 7505 |             func = ggml_cuda_mul_mat;
7276 7506 |             break;
7277 7507 |         case GGML_OP_SCALE:
7278      | -           if (!any_on_device) {
7279      | -               return false;
7280      | -           }
7281 7508 |             func = ggml_cuda_scale;
7282 7509 |             break;
7283      | -           case
     7510 | +       case GGML_OP_CLAMP:
7284 7511 |             if (!any_on_device) {
7285 7512 |                 return false;
7286 7513 |             }
     7514 | +           func = ggml_cuda_clamp;
     7515 | +           break;
     7516 | +       case GGML_OP_CPY:
7287 7517 |             func = ggml_cuda_cpy;
7288 7518 |             break;
7289 7519 |         case GGML_OP_CONT:
7290      | -           if (!any_on_device) {
7291      | -               return false;
7292      | -           }
7293 7520 |             func = ggml_cuda_dup;
7294 7521 |             break;
7295 7522 |         case GGML_OP_RESHAPE:
7296 7523 |         case GGML_OP_VIEW:
7297 7524 |         case GGML_OP_PERMUTE:
7298 7525 |         case GGML_OP_TRANSPOSE:
7299      | -           if (!any_on_device) {
7300      | -               return false;
7301      | -           }
7302 7526 |             func = ggml_cuda_nop;
7303 7527 |             break;
7304 7528 |         case GGML_OP_DIAG_MASK_INF:
7305      | -           if (!any_on_device) {
7306      | -               return false;
7307      | -           }
7308 7529 |             func = ggml_cuda_diag_mask_inf;
7309 7530 |             break;
7310 7531 |         case GGML_OP_SOFT_MAX:
7311      | -           if (!any_on_device) {
7312      | -               return false;
7313      | -           }
7314 7532 |             func = ggml_cuda_soft_max;
7315 7533 |             break;
7316 7534 |         case GGML_OP_ROPE:
7317      | -           if (!any_on_device) {
7318      | -               return false;
7319      | -           }
7320 7535 |             func = ggml_cuda_rope;
7321 7536 |             break;
7322 7537 |         case GGML_OP_ALIBI:
7323      | -           if (!any_on_device) {
7324      | -               return false;
7325      | -           }
7326 7538 |             func = ggml_cuda_alibi;
7327 7539 |             break;
7328 7540 |         default:
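
Taken together, the two hunks above restructure ggml_cuda_compute_forward: the per-case `if (!any_on_device) { return false; }` guards are hoisted into a single early return ahead of the switch (GGML_OP_MUL_MAT is exempt, so matrix multiplication can still be offloaded when only a source tensor lives on the GPU), and new dispatch cases are added for GGML_OP_REPEAT, GGML_OP_GET_ROWS and GGML_OP_CLAMP. A minimal sketch of the resulting shape, with an abbreviated op list and a simplified tail that is not shown in this diff, looks roughly like this:

// Sketch only: abbreviated op list, simplified tail; mirrors the structure of
// the refactored ggml_cuda_compute_forward, not a drop-in replacement.
static bool cuda_forward_sketch(ggml_compute_params * params, ggml_tensor * tensor) {
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
        || (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU)
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

    // one guard instead of a copy in every case; MUL_MAT may still be offloaded
    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
        return false;
    }

    switch (tensor->op) {
        case GGML_OP_REPEAT:   func = ggml_cuda_repeat;   break;
        case GGML_OP_GET_ROWS: func = ggml_cuda_get_rows; break;
        case GGML_OP_CLAMP:
            if (!any_on_device) { return false; } // CLAMP keeps its own residency check
            func = ggml_cuda_clamp;
            break;
        case GGML_OP_MUL_MAT:  func = ggml_cuda_mul_mat;  break;
        default:               return false;
    }

    if (params->ith != 0) {
        return true; // assumption: only the first thread launches CUDA work
    }
    func(tensor->src[0], tensor->src[1], tensor);
    return true;
}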
@@ -7350,3 +7562,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
7350 7562 |     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7351 7563 |     snprintf(description, description_size, "%s", prop.name);
7352 7564 | }
     7565 | +
     7566 | + ////////////////////////////////////////////////////////////////////////////////
     7567 | +
     7568 | + // backend interface
     7569 | +
     7570 | + #define UNUSED GGML_UNUSED
     7571 | +
     7572 | + struct ggml_backend_context_cuda {
     7573 | + };
     7574 | +
     7575 | + static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
     7576 | +     return GGML_CUDA_NAME;
     7577 | +
     7578 | +     UNUSED(backend);
     7579 | + }
     7580 | +
     7581 | + static void ggml_backend_cuda_free(ggml_backend_t backend) {
     7582 | +     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
     7583 | +     delete cuda_ctx;
     7584 | +     delete backend;
     7585 | + }
     7586 | +
     7587 | + struct ggml_backend_buffer_context_cuda {
     7588 | +     void * device;
     7589 | +
     7590 | +     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     7591 | +     size_t temp_tensor_extra_index = 0;
     7592 | +
     7593 | +     ~ggml_backend_buffer_context_cuda() {
     7594 | +         delete[] temp_tensor_extras;
     7595 | +     }
     7596 | +
     7597 | +     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     7598 | +         if (temp_tensor_extras == nullptr) {
     7599 | +             temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     7600 | +         }
     7601 | +
     7602 | +         size_t alloc_index = temp_tensor_extra_index;
     7603 | +         temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
     7604 | +         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
     7605 | +         memset(extra, 0, sizeof(*extra));
     7606 | +
     7607 | +         return extra;
     7608 | +     }
     7609 | + };
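
The buffer context above keeps the per-tensor "extras" (device pointers and related metadata) in a single fixed array of GGML_MAX_NODES entries and hands them out round-robin, so building a graph never allocates or frees metadata one tensor at a time. A self-contained sketch of the same round-robin pool, with a hypothetical Extra type and pool size (not the real ggml types), looks like this:

// Sketch of the same recycling idea: entries are reused after kMaxNodes
// allocations, which is safe when each entry only needs to live for the
// duration of one graph evaluation.
#include <cstring>
#include <cstddef>

struct Extra { void * data_device[16]; };

struct ExtraPool {
    static constexpr size_t kMaxNodes = 4096; // assumption: stands in for GGML_MAX_NODES
    Extra * slots = nullptr;
    size_t  next  = 0;

    Extra * alloc() {
        if (slots == nullptr) {
            slots = new Extra[kMaxNodes];     // allocated lazily on first use
        }
        Extra * e = &slots[next];
        next = (next + 1) % kMaxNodes;        // wrap around instead of freeing
        std::memset(e, 0, sizeof(*e));
        return e;
    }

    ~ExtraPool() { delete[] slots; }
};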
     7610 | +
     7611 | + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     7612 | +     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     7613 | +     CUDA_CHECK(cudaFree(ctx->device));
     7614 | +     delete ctx;
     7615 | + }
     7616 | +
     7617 | + static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     7618 | +     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     7619 | +     return ctx->device;
     7620 | + }
     7621 | +
     7622 | + static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     7623 | +     int64_t row_low = 0;
     7624 | +     int64_t row_high = ggml_nrows(tensor);
     7625 | +     int64_t nrows_split = row_high - row_low;
     7626 | +
     7627 | +     size_t size = ggml_nbytes_split(tensor, nrows_split);
     7628 | +
     7629 | +     int64_t ne0 = tensor->ne[0];
     7630 | +
     7631 | +     if (ggml_is_quantized(tensor->type)) {
     7632 | +         if (ne0 % MATRIX_ROW_PADDING != 0) {
     7633 | +             size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
     7634 | +                 * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
     7635 | +         }
     7636 | +     }
     7637 | +
     7638 | +     return size;
     7639 | +
     7640 | +     UNUSED(buffer);
     7641 | + }
|
+
|
7643
|
+
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
7644
|
+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
7645
|
+
|
7646
|
+
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
7647
|
+
assert(tensor->view_src->buffer->backend == buffer->backend);
|
7648
|
+
tensor->backend = tensor->view_src->backend;
|
7649
|
+
tensor->extra = tensor->view_src->extra;
|
7650
|
+
return;
|
7651
|
+
}
|
7652
|
+
|
7653
|
+
ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
|
7654
|
+
|
7655
|
+
extra->data_device[g_main_device] = tensor->data;
|
7656
|
+
|
7657
|
+
tensor->backend = GGML_BACKEND_GPU;
|
7658
|
+
tensor->extra = extra;
|
7659
|
+
|
7660
|
+
if (ggml_is_quantized(tensor->type)) {
|
7661
|
+
// initialize padding to 0 to avoid possible NaN values
|
7662
|
+
int64_t row_low = 0;
|
7663
|
+
int64_t row_high = ggml_nrows(tensor);
|
7664
|
+
int64_t nrows_split = row_high - row_low;
|
7665
|
+
|
7666
|
+
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
|
7667
|
+
size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
|
7668
|
+
|
7669
|
+
if (padded_size > original_size && tensor->view_src == nullptr) {
|
7670
|
+
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
|
7671
|
+
}
|
7672
|
+
}
|
7673
|
+
|
7674
|
+
UNUSED(buffer);
|
7675
|
+
}
|
     7676 | +
     7677 | + static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
     7678 | +     /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
     7679 | +     /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
     7680 | +     /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
     7681 | +     /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
     7682 | +     /* .free_tensor    = */ NULL,
     7683 | + };
     7684 | +
     7685 | + static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
     7686 | +     ggml_cuda_set_device(g_main_device);
     7687 | +
     7688 | +     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
     7689 | +     CUDA_CHECK(cudaMalloc(&ctx->device, size));
     7690 | +     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
     7691 | + }
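
cuda_backend_buffer_interface is a plain table of function pointers, and ggml_backend_buffer_init just bundles that table with a context pointer and a size, so the same mechanism can back other kinds of memory. As a purely illustrative sketch (the names, the pinned-host choice, and the assumption that NULL entries fall back to defaults such as ggml_nbytes are not part of this file):

// Hypothetical pinned-host buffer reusing the same ggml_backend_buffer_i shape.
struct host_buffer_ctx {
    void * ptr;
};

static void host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    host_buffer_ctx * ctx = (host_buffer_ctx *) buffer->context;
    CUDA_CHECK(cudaFreeHost(ctx->ptr));   // pinned memory allocated below
    delete ctx;
}

static void * host_buffer_get_base(ggml_backend_buffer_t buffer) {
    return ((host_buffer_ctx *) buffer->context)->ptr;
}

static struct ggml_backend_buffer_i host_buffer_interface = {
    /* .free_buffer    = */ host_buffer_free_buffer,
    /* .get_base       = */ host_buffer_get_base,
    /* .get_alloc_size = */ NULL,   // assumption: NULL falls back to ggml_nbytes
    /* .init_tensor    = */ NULL,   // no per-tensor setup needed
    /* .free_tensor    = */ NULL,
};

static ggml_backend_buffer_t host_alloc_buffer(ggml_backend_t backend, size_t size) {
    host_buffer_ctx * ctx = new host_buffer_ctx;
    CUDA_CHECK(cudaMallocHost(&ctx->ptr, size));
    return ggml_backend_buffer_init(backend, host_buffer_interface, ctx, size);
}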
     7692 | +
     7693 | + static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
     7694 | +     return 128;
     7695 | +     UNUSED(backend);
     7696 | + }
     7697 | +
     7698 | + static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     7699 | +     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
     7700 | +     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     7701 | +     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     7702 | +
     7703 | +     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
     7704 | +
     7705 | +     UNUSED(backend);
     7706 | + }
     7707 | +
     7708 | + static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     7709 | +     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     7710 | +     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     7711 | +     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     7712 | +
     7713 | +     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
     7714 | +
     7715 | +     UNUSED(backend);
     7716 | + }
     7717 | +
     7718 | + static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     7719 | +     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
     7720 | +
     7721 | +     UNUSED(backend);
     7722 | + }
|
+
|
7724
|
+
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
7725
|
+
GGML_ASSERT(!"not implemented");
|
7726
|
+
|
7727
|
+
return nullptr;
|
7728
|
+
|
7729
|
+
UNUSED(backend);
|
7730
|
+
UNUSED(cgraph);
|
7731
|
+
}
|
7732
|
+
|
7733
|
+
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
7734
|
+
GGML_ASSERT(!"not implemented");
|
7735
|
+
|
7736
|
+
UNUSED(backend);
|
7737
|
+
UNUSED(plan);
|
7738
|
+
}
|
7739
|
+
|
7740
|
+
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
7741
|
+
GGML_ASSERT(!"not implemented");
|
7742
|
+
|
7743
|
+
UNUSED(backend);
|
7744
|
+
UNUSED(plan);
|
7745
|
+
}
|
7746
|
+
|
7747
|
+
static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
7748
|
+
ggml_cuda_set_device(g_main_device);
|
7749
|
+
|
7750
|
+
ggml_compute_params params = {};
|
7751
|
+
params.type = GGML_TASK_COMPUTE;
|
7752
|
+
params.ith = 0;
|
7753
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
7754
|
+
ggml_tensor * node = cgraph->nodes[i];
|
7755
|
+
|
7756
|
+
assert(node->backend == GGML_BACKEND_GPU);
|
7757
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
7758
|
+
if (node->src[j] != nullptr) {
|
7759
|
+
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
7760
|
+
}
|
7761
|
+
}
|
7762
|
+
|
7763
|
+
bool ok = ggml_cuda_compute_forward(¶ms, node);
|
7764
|
+
if (!ok) {
|
7765
|
+
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
7766
|
+
}
|
7767
|
+
GGML_ASSERT(ok);
|
7768
|
+
|
7769
|
+
#if 0
|
7770
|
+
if (node->type == GGML_TYPE_F32) {
|
7771
|
+
cudaDeviceSynchronize();
|
7772
|
+
std::vector<float> tmp(ggml_nelements(node), 0.0f);
|
7773
|
+
cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
|
7774
|
+
printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
|
7775
|
+
ggml_type_name(node->src[0]->type),
|
7776
|
+
node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
|
7777
|
+
node->src[0]->name,
|
7778
|
+
node->src[1] ? node->src[1]->name : "none");
|
7779
|
+
double sum = 0.0;
|
7780
|
+
double sq_sum = 0.0;
|
7781
|
+
for (int i = 0; i < ggml_nelements(node); i++) {
|
7782
|
+
printf("%f ", tmp[i]);
|
7783
|
+
sum += tmp[i];
|
7784
|
+
sq_sum += tmp[i]*tmp[i];
|
7785
|
+
}
|
7786
|
+
printf("\n");
|
7787
|
+
printf("sum: %f, ", sum);
|
7788
|
+
printf("sq_sum: %f\n", sq_sum);
|
7789
|
+
}
|
7790
|
+
#endif
|
7791
|
+
}
|
7792
|
+
|
7793
|
+
UNUSED(backend);
|
7794
|
+
}
|
     7795 | +
     7796 | + static ggml_backend_i cuda_backend_i = {
     7797 | +     /* .get_name            = */ ggml_backend_cuda_name,
     7798 | +     /* .free                = */ ggml_backend_cuda_free,
     7799 | +     /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
     7800 | +     /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
     7801 | +     /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
     7802 | +     /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
     7803 | +     /* .synchronize         = */ ggml_backend_cuda_synchronize,
     7804 | +     /* .cpy_tensor_from     = */ nullptr,
     7805 | +     /* .cpy_tensor_to       = */ nullptr,
     7806 | +     /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
     7807 | +     /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
     7808 | +     /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
     7809 | +     /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
     7810 | +     /* .supports_op         = */ nullptr,
     7811 | + };
     7812 | +
     7813 | + ggml_backend_t ggml_backend_cuda_init() {
     7814 | +     ggml_init_cublas(); // TODO: remove from ggml.c
     7815 | +
     7816 | +     ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
     7817 | +
     7818 | +     ggml_backend_t cuda_backend = new ggml_backend {
     7819 | +         /* .interface = */ cuda_backend_i,
     7820 | +         /* .context   = */ ctx
     7821 | +     };
     7822 | +
     7823 | +     return cuda_backend;
     7824 | + }
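
Putting the pieces together, ggml_backend_cuda_init (exposed through ggml-cuda.h) is the entry point the rest of the stack uses to obtain a CUDA backend. A rough usage sketch, written as if it lived at the bottom of this file since most of the helpers are static; ggml_backend_buffer_free is assumed to be the freeing helper declared in ggml-backend.h:

// Illustrative smoke test only: create the backend, grab a device buffer,
// then tear everything down. Tensor placement and graph execution would go
// where the comment indicates.
static void cuda_backend_smoke_test(size_t nbytes) {
    ggml_backend_t backend = ggml_backend_cuda_init();                            // initializes cuBLAS + context
    ggml_backend_buffer_t buf = ggml_backend_cuda_alloc_buffer(backend, nbytes);  // cudaMalloc under the hood

    // ... place tensors in `buf` (init_tensor fills in their GPU extras),
    // upload data with ggml_backend_cuda_set_tensor_async, build a cgraph
    // and run it with ggml_backend_cuda_graph_compute ...

    ggml_backend_cuda_synchronize(backend);
    ggml_backend_buffer_free(buf);     // assumption: helper from ggml-backend.h
    ggml_backend_cuda_free(backend);
}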